# howard.objects.variants
import csv
import gc
import gzip
import io
import multiprocessing
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel
import cyvcf2
import pyBigWig
import math

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:
    """
    Handle a set of variants backed by a database connexion (DuckDB or
    SQLite): input/output files, configuration, parameters, VCF header,
    samples and data loading.
    """

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        Initialize the object: input, output, config, param, connexion,
        header and samples; optionally load the data.

        :param conn: the connection to the database (a new one is created when None)
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration of the model
        :param param: a dictionary containing the parameters of the model
        :param load: if True, load the input data immediately
        """

        # BUGFIX: mutable default arguments ({} for config/param) were shared
        # across all instances; use None sentinels instead (backward-compatible).
        if config is None:
            config = {}
        if param is None:
            param = {}

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()
90 91 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 92 input and sets the `samples` attribute of the class to the provided list. If no samples are 93 provided, it tries to get the samples from the class's parameters using the `get_param` method 94 :type samples: list 95 :return: The `samples` list is being returned. 96 """ 97 98 if not samples: 99 samples = self.get_param().get("samples", {}).get("list", None) 100 101 self.samples = samples 102 103 return samples 104 105 def get_samples(self) -> list: 106 """ 107 This function returns a list of samples. 108 :return: The `get_samples` method is returning the `samples` attribute of the object. 109 """ 110 111 return self.samples 112 113 def get_samples_check(self) -> bool: 114 """ 115 This function returns the value of the "check" key within the "samples" dictionary retrieved 116 from the parameters. 117 :return: The method `get_samples_check` is returning the value of the key "check" inside the 118 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 119 method. If the key "check" is not found, it will return `False`. 120 """ 121 122 return self.get_param().get("samples", {}).get("check", True) 123 124 def set_input(self, input: str = None) -> None: 125 """ 126 The function `set_input` takes a file name as input, extracts the name and extension, and sets 127 attributes in the class accordingly. 128 129 :param input: The `set_input` method in the provided code snippet is used to set attributes 130 related to the input file. 
Here's a breakdown of the parameters and their usage in the method: 131 :type input: str 132 """ 133 134 if input and not isinstance(input, str): 135 try: 136 self.input = input.name 137 except: 138 log.error(f"Input file '{input} in bad format") 139 raise ValueError(f"Input file '{input} in bad format") 140 else: 141 self.input = input 142 143 # Input format 144 if input: 145 input_name, input_extension = os.path.splitext(self.input) 146 self.input_name = input_name 147 self.input_extension = input_extension 148 self.input_format = self.input_extension.replace(".", "") 149 150 def set_config(self, config: dict) -> None: 151 """ 152 The set_config function takes a config object and assigns it as the configuration object for the 153 class. 154 155 :param config: The `config` parameter in the `set_config` function is a dictionary object that 156 contains configuration settings for the class. When you call the `set_config` function with a 157 dictionary object as the argument, it will set that dictionary as the configuration object for 158 the class 159 :type config: dict 160 """ 161 162 self.config = config 163 164 def set_param(self, param: dict) -> None: 165 """ 166 This function sets a parameter object for the class based on the input dictionary. 
167 168 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 169 as the `param` attribute of the class instance 170 :type param: dict 171 """ 172 173 self.param = param 174 175 def init_variables(self) -> None: 176 """ 177 This function initializes the variables that will be used in the rest of the class 178 """ 179 180 self.prefix = "howard" 181 self.table_variants = "variants" 182 self.dataframe = None 183 184 self.comparison_map = { 185 "gt": ">", 186 "gte": ">=", 187 "lt": "<", 188 "lte": "<=", 189 "equals": "=", 190 "contains": "SIMILAR TO", 191 } 192 193 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 194 195 self.code_type_map_to_sql = { 196 "Integer": "INTEGER", 197 "String": "VARCHAR", 198 "Float": "FLOAT", 199 "Flag": "VARCHAR", 200 } 201 202 self.index_additionnal_fields = [] 203 204 def get_indexing(self) -> bool: 205 """ 206 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 207 returns False. 208 :return: The value of the indexing parameter. 209 """ 210 211 return self.get_param().get("indexing", False) 212 213 def get_connexion_config(self) -> dict: 214 """ 215 The function `get_connexion_config` returns a dictionary containing the configuration for a 216 connection, including the number of threads and memory limit. 217 :return: a dictionary containing the configuration for the Connexion library. 
218 """ 219 220 # config 221 config = self.get_config() 222 223 # Connexion config 224 connexion_config = {} 225 threads = self.get_threads() 226 227 # Threads 228 if threads: 229 connexion_config["threads"] = threads 230 231 # Memory 232 # if config.get("memory", None): 233 # connexion_config["memory_limit"] = config.get("memory") 234 if self.get_memory(): 235 connexion_config["memory_limit"] = self.get_memory() 236 237 # Temporary directory 238 if config.get("tmp", None): 239 connexion_config["temp_directory"] = config.get("tmp") 240 241 # Access 242 if config.get("access", None): 243 access = config.get("access") 244 if access in ["RO"]: 245 access = "READ_ONLY" 246 elif access in ["RW"]: 247 access = "READ_WRITE" 248 connexion_db = self.get_connexion_db() 249 if connexion_db in ":memory:": 250 access = "READ_WRITE" 251 connexion_config["access_mode"] = access 252 253 return connexion_config 254 255 def get_duckdb_settings(self) -> dict: 256 """ 257 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 258 string. 259 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 260 """ 261 262 # config 263 config = self.get_config() 264 265 # duckdb settings 266 duckdb_settings_dict = {} 267 if config.get("duckdb_settings", None): 268 duckdb_settings = config.get("duckdb_settings") 269 duckdb_settings = full_path(duckdb_settings) 270 # duckdb setting is a file 271 if os.path.exists(duckdb_settings): 272 with open(duckdb_settings) as json_file: 273 duckdb_settings_dict = yaml.safe_load(json_file) 274 # duckdb settings is a string 275 else: 276 duckdb_settings_dict = json.loads(duckdb_settings) 277 278 return duckdb_settings_dict 279 280 def set_connexion_db(self) -> str: 281 """ 282 The function `set_connexion_db` returns the appropriate database connection string based on the 283 input format and connection type. 284 :return: the value of the variable `connexion_db`. 
285 """ 286 287 # Default connexion db 288 default_connexion_db = ":memory:" 289 290 # Find connexion db 291 if self.get_input_format() in ["db", "duckdb"]: 292 connexion_db = self.get_input() 293 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 294 connexion_db = default_connexion_db 295 elif self.get_connexion_type() in ["tmpfile"]: 296 tmp_name = tempfile.mkdtemp( 297 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 298 ) 299 connexion_db = f"{tmp_name}/tmp.db" 300 elif self.get_connexion_type() != "": 301 connexion_db = self.get_connexion_type() 302 else: 303 connexion_db = default_connexion_db 304 305 # Set connexion db 306 self.connexion_db = connexion_db 307 308 return connexion_db 309 310 def set_connexion(self, conn) -> None: 311 """ 312 The function `set_connexion` creates a connection to a database, with options for different 313 database formats and settings. 314 315 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 316 database. If a connection is not provided, a new connection to an in-memory database is created. 
    def set_connexion(self, conn) -> None:
        """
        Create (if needed) and store the database connexion.

        If no connexion is provided, a new one is created according to the
        configured format: DuckDB (the default, with optional PRAGMA
        settings) or SQLite.

        :param conn: an existing database connexion, or None to create one
        """

        # Connexion db (":memory:", file path, ...)
        connexion_db = self.set_connexion_db()

        # Connexion config (threads, memory, access mode, ...)
        connexion_config = self.get_connexion_config()

        # Connexion format ("duckdb" by default)
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings applied as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            # Quote string values for the PRAGMA statement
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                # NOTE(review): connexion_config is not applied to SQLite
                # connexions — confirm this is intentional
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
If no output is provided, it sets the output to `None` 364 :type output: str 365 """ 366 367 if output and not isinstance(output, str): 368 self.output = output.name 369 else: 370 self.output = output 371 372 # Output format 373 if self.output: 374 output_name, output_extension = os.path.splitext(self.output) 375 self.output_name = output_name 376 self.output_extension = output_extension 377 self.output_format = self.output_extension.replace(".", "") 378 else: 379 self.output_name = None 380 self.output_extension = None 381 self.output_format = None 382 383 def set_header(self) -> None: 384 """ 385 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 386 """ 387 388 input_file = self.get_input() 389 default_header_list = [ 390 "##fileformat=VCFv4.2", 391 "#CHROM POS ID REF ALT QUAL FILTER INFO", 392 ] 393 394 # Full path 395 input_file = full_path(input_file) 396 397 if input_file: 398 399 input_format = self.get_input_format() 400 input_compressed = self.get_input_compressed() 401 config = self.get_config() 402 header_list = default_header_list 403 if input_format in [ 404 "vcf", 405 "hdr", 406 "tsv", 407 "csv", 408 "psv", 409 "parquet", 410 "db", 411 "duckdb", 412 ]: 413 # header provided in param 414 if config.get("header_file", None): 415 with open(config.get("header_file"), "rt") as f: 416 header_list = self.read_vcf_header(f) 417 # within a vcf file format (header within input file itsself) 418 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 419 # within a compressed vcf file format (.vcf.gz) 420 if input_compressed: 421 with bgzf.open(input_file, "rt") as f: 422 header_list = self.read_vcf_header(f) 423 # within an uncompressed vcf file format (.vcf) 424 else: 425 with open(input_file, "rt") as f: 426 header_list = self.read_vcf_header(f) 427 # header provided in default external file .hdr 428 elif os.path.exists((input_file + ".hdr")): 429 with open(input_file + ".hdr", "rt") as f: 430 header_list = 
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (`self.header_list`) and as a VCF reader object
        (`self.header_vcf`). Falls back to a minimal default VCF header when
        no header can be found; both attributes are None without an input.
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except — any failure (including
                    # programming errors) falls back to the default header
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
This query will be used to fetch data from a 492 database and convert it into a pandas DataFrame 493 :type query: str 494 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 495 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 496 function will only fetch up to that number of rows from the database query result. If no limit 497 is specified, 498 :type limit: int 499 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 500 """ 501 502 # Connexion format 503 connexion_format = self.get_connexion_format() 504 505 # Limit in query 506 if limit: 507 pd.set_option("display.max_rows", limit) 508 if connexion_format in ["duckdb"]: 509 df = ( 510 self.conn.execute(query) 511 .fetch_record_batch(limit) 512 .read_next_batch() 513 .to_pandas() 514 ) 515 elif connexion_format in ["sqlite"]: 516 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 517 518 # Full query 519 else: 520 if connexion_format in ["duckdb"]: 521 df = self.conn.execute(query).df() 522 elif connexion_format in ["sqlite"]: 523 df = pd.read_sql_query(query, self.conn) 524 525 return df 526 527 def get_overview(self) -> None: 528 """ 529 The function prints the input, output, config, and dataframe of the current object 530 """ 531 table_variants_from = self.get_table_variants(clause="from") 532 sql_columns = self.get_header_columns_as_sql() 533 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 534 df = self.get_query_to_df(sql_query_export) 535 log.info( 536 "Input: " 537 + str(self.get_input()) 538 + " [" 539 + str(str(self.get_input_format())) 540 + "]" 541 ) 542 log.info( 543 "Output: " 544 + str(self.get_output()) 545 + " [" 546 + str(str(self.get_output_format())) 547 + "]" 548 ) 549 log.info("Config: ") 550 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 551 "\n" 552 ): 553 log.info("\t" + str(d)) 554 log.info("Param: ") 555 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 556 "\n" 557 ): 558 log.info("\t" + str(d)) 559 log.info("Sample list: " + str(self.get_header_sample_list())) 560 log.info("Dataframe: ") 561 for d in str(df).split("\n"): 562 log.info("\t" + str(d)) 563 564 # garbage collector 565 del df 566 gc.collect() 567 568 return None 569 570 def get_stats(self) -> dict: 571 """ 572 The `get_stats` function calculates and returns various statistics of the current object, 573 including information about the input file, variants, samples, header fields, quality, and 574 SNVs/InDels. 575 :return: a dictionary containing various statistics of the current object. The dictionary has 576 the following structure: 577 """ 578 579 # Log 580 log.info(f"Stats Calculation...") 581 582 # table varaints 583 table_variants_from = self.get_table_variants() 584 585 # stats dict 586 stats = {"Infos": {}} 587 588 ### File 589 input_file = self.get_input() 590 stats["Infos"]["Input file"] = input_file 591 592 # Header 593 header_infos = self.get_header().infos 594 header_formats = self.get_header().formats 595 header_infos_list = list(header_infos) 596 header_formats_list = list(header_formats) 597 598 ### Variants 599 600 stats["Variants"] = {} 601 602 # Variants by chr 603 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 604 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 605 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 606 by=["CHROM"], kind="quicksort" 607 ) 608 609 # Total number of variants 610 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 611 612 # Calculate percentage 613 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 614 lambda x: (x / nb_of_variants) 615 ) 616 617 stats["Variants"]["Number of variants by chromosome"] = ( 618 nb_of_variants_by_chrom.to_dict(orient="index") 619 ) 620 621 
    def get_stats(self) -> dict:
        """
        Compute statistics on the current object: input file infos, variant
        counts (by chromosome, by type, substitutions), samples and
        genotypes, header INFO/FORMAT fields and quality metrics.

        :return: a dictionary of statistics with "Infos", "Variants",
            "Samples", "Header" and (when a QUAL column exists) "Quality"
            sections
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; only rows whose sample column
                # matches a genotype pattern and has as many ':'-fields as
                # FORMAT are considered
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                    REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                    count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                    concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # NOTE: i keeps counting across INFO and FORMAT sections
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num — map special VCF "Number" codes to their letters
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
            """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel branch, AND binds tighter than OR, so the
        # condition reads `len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT))`
        # — confirm this precedence is intended
        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                'Total' AS Type,
                count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                'MNV' AS Type,
                count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                'InDel' AS Type,
                count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

                UNION

                SELECT
                'SNV' AS Type,
                count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        Compute the statistics and write them as JSON to the given file.

        :param file: the file path where the JSON data will be written
        :type file: str
        :return: the path of the file that was written
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to the stats file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a Markdown report of the statistics (and the underlying
        JSON file), then print the report to standard output.

        :param output_file: path of the Markdown output file; a temporary
            file named "stats.md" is used when not provided
        :type output_file: str
        :param json_file: path of the JSON stats file; a temporary file named
            "stats.json" is used when not provided
        :type json_file: str
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Render as a Markdown table when the value is (or
                        # parses to) a table-like dict; otherwise as a bullet
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        Return the input file path.

        :return: the `input` attribute
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        Return the format of the given input file (or of the object's input
        file when none is provided), as determined by `get_file_format`.

        :param input_file: optional input file path
        :type input_file: str
        :return: the input file format
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format
        If no `input_file` is provided, it defaults to `None` and the method falls
        back to `self.get_input()` to determine which file to inspect.
        :type input_file: str
        :return: The compressed format of the input file, as detected by
        `get_file_compressed`.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        Return the output file path.

        :return: The `output` attribute of the object.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        Return the file format of the given output file (or of the object's output).

        :param output_file: Path of the output file to inspect. If not provided,
            the path returned by `get_output` is used
        :type output_file: str
        :return: The format of the output file, as detected by `get_file_format`.
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        Return the configuration dictionary.

        :return: The `config` attribute of the object.
        """
        return self.config

    def get_param(self) -> dict:
        """
        Return the parameters dictionary.

        :return: The `param` attribute of the object.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        Return the connexion database identifier.

        :return: The `connexion_db` attribute of the object.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        Return the prefix of the object.

        :return: The `prefix` attribute of the object.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        Return the variants table name, formatted for use in a given SQL clause.

        :param clause: The type of clause the table will be used in: "select",
            "where", "update" or "from", defaults to "select" (optional)
        :return: The table name; for clause "from" it is aliased as "variants",
            and when the input is a parquet file opened read-only the parquet
            file path itself is returned as the FROM target.
        """

        # Access mode ("RO" means read-only)
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only parquet input, query the file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        Return the temporary directory path based on configuration and
        parameters, defaulting to "/tmp".

        :return: The temporary directory path resolved by `get_tmp`.
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the configuration.

        :return: The "connexion_type" configuration value, "memory" by default.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        Return the database connection object.

        :return: The `conn` attribute of the object.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        Close the connection to the database.

        :return: The value returned by the underlying connection's `close()`.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        Return the header of the VCF file.

        :param type: The representation wanted: "vcf" for a `vcf.Reader` header
            object, "list" for the raw header lines, defaults to "vcf" (optional)
        :return: The header in the requested representation; when no header has
            been loaded, a minimal header built from `vcf_required` is returned.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            # No header loaded: fall back to the minimal required VCF header
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_infos_list(self) -> list:
        """
        Return the list of INFO field identifiers declared in the header.

        :return: A list of INFO field names from the header.
        """

        # Init
        infos_list = []

        for field in self.get_header().infos:
            infos_list.append(field)

        return infos_list

    def get_header_length(self, file: str = None) -> int:
        """
        Return the length of the header, excluding the #CHROM line.

        :param file: Optional path to a VCF header file; if provided, the header
            is read from that file instead of the loaded header
        :type file: str
        :return: The number of header lines minus 1 (the #CHROM line), or 0 when
            no header is available.
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        Return the columns line ("#CHROM ...") of the VCF header.

        :return: The last header line, or "" when no header is available.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        Return the VCF header columns line as a list of column names.

        :return: The tab-split column names, or [] when no header is available.
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        Return the header columns as a comma-separated list of quoted SQL
        identifiers.

        :return: A string such as '"#CHROM","POS",...' suitable for SQL queries.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(
        self, check: bool = False, samples: list = None, samples_force: bool = False
    ) -> list:
        """
        Return the list of samples from the VCF header, with optional filtering
        and genotype-column checking.

        :param check: If True, keep only samples whose column is a genotype
            column (see `is_genotype_column`), defaults to False
        :type check: bool (optional)
        :param samples: Optional subset of sample names; each is kept only if it
            is present in the header (a warning is logged otherwise)
        :type samples: list
        :param samples_force: If True, return the sample list without the
            genotype-column check, logging a warning, defaults to False
        :type samples_force: bool (optional)
        :return: The resulting list of sample names.
        """

        # Init
        samples_list = []

        if samples is None:
            samples_list = self.header_vcf.samples
        else:
            # Keep only the requested samples that exist in the header
            samples_checked = []
            for sample in samples:
                if sample in self.header_vcf.samples:
                    samples_checked.append(sample)
                else:
                    log.warning(f"Sample '{sample}' not defined in header")
            samples_list = samples_checked

        # Force sample list without checking if is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

        if check:
            # Keep only samples whose column actually holds genotypes
            samples_checked = []
            for sample in samples_list:
                if self.is_genotype_column(column=sample):
                    samples_checked.append(sample)
                else:
                    log.warning(
                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                    )
            samples_list = samples_checked

        # Return samples list
        return samples_list

    def is_genotype_column(self, column: str = None) -> bool:
        """
        Check whether a given column is a genotype column.

        :param column: The column name to check; the check is delegated to
            `Database.is_genotype_column` on the input file
        :type column: str
        :return: True if the column is a genotype column, False when `column`
            is None or the check fails.
        """

        if column is not None:
            return Database(database=self.get_input()).is_genotype_column(column=column)
        else:
            return False

    def get_verbose(self) -> bool:
        """
        Return the "verbose" flag from the configuration.

        :return: The value of the "verbose" key in the config dictionary, or
            False if the key doesn't exist.
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        Return the connexion format of the object.

        :return: The `connexion_format` attribute ("duckdb" or "sqlite").
        :raises ValueError: If the connexion format is not one of the supported
            values.
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current database connection.

        :param file: The file (path or file-like object) to load into the table
        :param columns: Comma-separated, quoted column names to insert into
        :type columns: str
        :param header_len: Number of leading lines to skip before the data,
            defaults to 0
        :type header_len: int (optional)
        :param sep: Field separator used in the file, defaults to "\\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden by
            the "load.chunk" configuration value, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: allow the chunk size to be overridden via configuration
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to a falsy value (0/None), the
        # file is silently not loaded at all — confirm this is intended.
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # "FROM chunk" relies on duckdb's replacement scan picking
                    # up the local pandas DataFrame named `chunk`
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file (VCF/TSV/CSV/PSV or an existing database format)
        and load it into the variants table, optionally dropping the table
        first and exploding INFO fields afterwards.

        :param input_file: Path to the input file to load; if provided, it
            replaces the current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: If True, drop the variants table before
            loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled to infer the schema; falsy
            values are mapped to -1 (no limit), defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: If the input format is not loadable with the
            current connexion format.
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size (-1 means no limit)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): this else-branch is unreachable inside the
                # duckdb branch; the error message also contains a typo
                # ("compatilbe") kept as-is since it is a runtime string.
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View over the input file
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access gets a VIEW; otherwise materialize a TABLE
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except hides the real failure (e.g. SQL
                # errors) behind a generic "format not available" message —
                # consider catching specific exceptions and chaining them.
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this is an alias, not a copy — mutating
            # structure_complete also mutates structure (harmless here since
            # structure is not reused, but worth confirming).
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the length of each file chunk loaded
            chunksize = 100000

            # delimiter for the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): rebinding input_file to bgzf.open leaves the
                # original plain handle to be closed by the with-block while
                # the bgzf handle is never explicitly closed — confirm.
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

    def get_explode_infos(self) -> bool:
        """
        Return the "explode_infos" flag from the parameters.

        :return: The value of param["explode"]["explode_infos"], or False when
            not set.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Return the list of INFO fields to explode, expanding patterns against
        the header.

        :param explode_infos_fields: Fields to explode, as a comma-separated
            string or a list; "*" (the default when nothing is configured)
            matches all header fields, and each entry is treated as a regex
            pattern matched against the header INFO fields
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: If True, drop fields that are not
            present in the header, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The ordered, de-duplicated list of resolved field names.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search: an exact header match wins
                # over pattern expansion, and expanded matches must not repeat
                # explicitly requested fields
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        Return the prefix used for exploded INFO columns.

        :param explode_infos_prefix: Explicit prefix; when not provided, the
            value of param["explode"]["explode_infos_prefix"] is used (default "")
        :type explode_infos_prefix: str
        :return: The resolved prefix string.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table (optionally dropping an existing column with
        the same name first).

        :param table_name: The name of the table to which the column is added
        :param column_name: The name of the column to add
        :param column_type: The SQL data type of the column (e.g. "INTEGER",
            "VARCHAR")
        :param default_value: Optional DEFAULT value for the new column
        :param drop: If True and the column already exists, drop it and
            recreate it; if False, leave the existing column untouched,
            defaults to False
        :type drop: bool (optional)
        :return: A dict describing the added column (table_name, column_name,
            column_type, default_value), or None when nothing was added (the
            column already existed without drop, or it was dropped and
            recreated).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE(review): after a drop+recreate, `added` stays False and the
        # return value is None even though the column was (re)created — confirm
        # callers rely on this (explode_infos uses `added_column or force`).
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

    def drop_column(
        self, column: dict = None, table_name: str = None, column_name: str = None
    ) -> bool:
        """
        Drop a column from a table.

        :param column: Either a dict with "table_name" and "column_name" keys,
            or a plain column name (the variants table is then assumed);
            takes precedence over the other two parameters
        :type column: dict
        :param table_name: The table to drop the column from
        :type table_name: str
        :param column_name: The name of the column to drop
        :type column_name: str
        :return: True if the column was dropped, False when it does not exist
            or no table/column was resolved.
        """

        # Find column infos
        if column:
            if isinstance(column, dict):
                table_name = column.get("table_name", None)
                column_name = column.get("column_name", None)
            elif isinstance(column, str):
                table_name = self.get_table_variants()
                column_name = column
            else:
                table_name = None
                column_name = None

        # NOTE(review): this bails out only when BOTH are missing ("and");
        # with exactly one missing, the existence check below returns False —
        # confirm "or" was not intended.
        if not table_name and not column_name:
            return False

        # Removed
        removed = False

        # Check if the column exists in the table (case-sensitive here,
        # unlike add_column's check)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(f"The {column_name} column exists in the {table_name} table")
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
            return False

        # Drop column from table (e.g. ALTER TABLE integers DROP k)
        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
        self.execute_query(add_column_query)
        removed = True
        log.debug(
            f"The {column_name} column was successfully dropped to the {table_name} table"
        )

        return removed

    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode the INFO column of the variants table into one column per INFO
        field, returning the list of added columns.

        :param prefix: Prefix for the exploded column names; falls back to
            `get_explode_infos_prefix()` and then to "INFO/"
        :type prefix: str
        :param create_index: If True, (re)create indexes after exploding,
            defaults to False
        :type create_index: bool (optional)
        :param fields: INFO fields (or patterns) to explode; when empty, all
            header fields are used via `get_explode_infos_fields`
        :type fields: list
        :param force: If True, drop and recreate columns that already exist,
            defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: If True, update all exploded
            columns in a single UPDATE statement per chromosome instead of one
            UPDATE per field, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: Optional table name to operate on; defaults to the
            variants table
        :type table: str
        :return: A list of dicts describing the added columns (see
            `add_column`).
        """

        # drop indexes (they would slow down the bulk UPDATEs below)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except — any failure in get_extra_infos is
            # silently treated as "no extra infos".
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/cardinality from header; unknown fields are
                    # treated as single String values
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    # Multi-valued fields are stored as VARCHAR
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=<value>" from the
                        # semicolon-separated INFO column, mapping '' and '.'
                        # to NULL
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (falls back to a single pass on failure)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful when splitting by chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table (main composite index plus one
        index per key column and per additional exploded field). No-op when
        indexing is disabled or access is read-only.
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all indexes on the variants table. No-op when access is
        read-only.
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            # List existing indexes using the engine's catalog
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        Read the header of a VCF file from an open file object.

        :param f: The file object to read from
        :return: The header lines, up to and including the #CHROM line.
        """

        header_list = []
        for line in f:
            header_list.append(line)
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        Read the header of a VCF file, handling both bgzip-compressed and
        plain files.

        :param file: Path to the VCF (header) file to read
        :type file: str
        :return: The header lines of the file.
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
        """
        if query:
            return self.conn.execute(query)  # .fetchall()
        else:
            return None

    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
        fields_to_rename: dict | None = None
    ) -> bool:
        """
        Export the variants data to an output file (VCF, TSV/CSV/PSV, Parquet...).

        Missing arguments are resolved from the object's param/config
        dictionaries. For SQLite connexions the data is first dumped to a
        temporary Parquet file, which is then used as the export source.

        :param output_file: path of the output file (defaults to `self.get_output()`)
        :param output_header: path of the exported header file (defaults to
            `<output_file>.hdr`)
        :param export_header: whether to export the header to a separate file;
            forced off for VCF outputs, defaults to True
        :param query: optional SQL query used to filter/select the exported data
        :param parquet_partitions: columns used to partition a Parquet export
            (a comma-separated string is also accepted)
        :param chunk_size: batch size for Parquet export (from config if unset)
        :param threads: number of threads (from `self.get_threads()` if unset)
        :param sort: sort the output by genomic coordinates, defaults to False
        :param index: create an index on the output file, defaults to False
        :param order_by: column(s) used to order the output (export param if unset)
        :param fields_to_rename: mapping of INFO field names to rename before export
        :return: True if the output file exists after export, otherwise None
        """

        # Log
        log.info("Exporting...")

        # Resolve user paths (e.g. "~") to full paths
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Temporary files to remove at the end
        tmp_to_remove = []

        # If no output, get it from the object
        if not output_file:
            output_file = self.get_output()

        # If no threads provided, use the configured number
        if not threads:
            threads = self.get_threads()

        # Rename INFO fields before export
        if not fields_to_rename:
            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
        self.rename_info_fields(fields_to_rename=fields_to_rename)

        # Auto header name with extension
        # NOTE(review): export_header() is not given output_header, so a custom
        # output_header path may differ from the file actually written — confirm
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            self.export_header(output_file=output_file)

        # Switch off separate header export if VCF output (header is embedded)
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size (from config if unset)
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partitions (export param if unset; accept "a,b" string form)
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by (export param if unset)
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Whether to include the header inside the output file itself
        header_in_output = param.get("export", {}).get("include_header", False)

        # Export source: the live connexion by default
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode INFO fields if configured
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        if connexion_format in ["sqlite"]:

            # SQLite cannot be exported directly: dump to a temporary Parquet
            # file and use it as the export source
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
                """

            # Write source file (fastparquet)
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Wrap the source in a Database object that knows how to export
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing columns in the header
        existing_columns_header = database.get_header_columns_from_database(query=query)

        # Sample list (VCF outputs only)
        if output_file_type in ["vcf"]:
            get_samples = self.get_samples()
            get_samples_check = self.get_samples_check()
            # Force the sample list only when samples were explicitly provided
            samples_force = get_samples is not None
            sample_list = self.get_header_sample_list(
                check=get_samples_check,
                samples=get_samples,
                samples_force=samples_force,
            )
        else:
            sample_list = None

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
            sample_list=sample_list,
        )

        # Remove temporary files
        remove_if_exists(tmp_to_remove)

        # NOTE(review): both operands of `and` are identical — this is
        # equivalent to `os.path.exists(output_file) or None`
        return (os.path.exists(output_file) or None) and (
            os.path.exists(output_file) or None
        )

    def get_extra_infos(self, table: str = None) -> list:
        """
        Return the columns present in a table but absent from the VCF header.

        :param table: table to inspect; defaults to the variants table, in which
            case the VCF header columns are used as the reference set
        :return: the list of column names not found in the header columns
        """

        header_columns = []

        if not table:
            table = self.get_table_variants(clause="from")
            header_columns = self.get_header_columns()

        # Fetch all columns of the table (LIMIT 1 is enough to get the schema)
        query = f""" SELECT * FROM {table} LIMIT 1 """
        log.debug(f"query {query}")
        table_columns = self.get_query_to_df(query).columns.tolist()
        extra_columns = []

        # Keep the columns that are not part of the header
        for column in table_columns:
            if column not in header_columns:
                extra_columns.append(column)

        return extra_columns

    def get_extra_infos_sql(self, table: str = None) -> str:
        """
        Return the extra columns as a comma-separated, double-quoted SQL list.

        :param table: table to inspect; defaults to the variants table
        :return: a string such as '"col1", "col2"' suitable for an SQL SELECT
        """

        return ", ".join(
            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
        )

    def export_header(
        self,
        header_name: str = None,
        output_file: str = None,
        output_file_ext: str = ".hdr",
        clean_header: bool = True,
        remove_chrom_line: bool = False,
    ) -> str:
        """
        Write the VCF header to a separate `<output_file><ext>` file.

        The "#CHROM" line is rewritten with the real columns of the input
        database, malformed FORMAT Flag declarations can be cleaned, and the
        "#CHROM" line can optionally be dropped.

        :param header_name: name of the header file to create; when neither
            header_name nor output_file is given, the object's output is used
        :param output_file: base name of the file the header belongs to
        :param output_file_ext: extension appended to output_file, defaults to ".hdr"
        :param clean_header: rewrite "##FORMAT=...Type=Flag" lines to
            Type=String (Flag is invalid for FORMAT), defaults to True
        :param remove_chrom_line: drop the "#CHROM" line from the exported
            header, defaults to False
        :return: the name of the header file created
        """

        if not header_name and not output_file:
            output_file = self.get_output()

        if self.get_header():

            # Get header object
            header_obj = self.get_header()

            # Database wrapper for the input file (to read its real columns)
            db_for_header = Database(database=self.get_input())

            # Get real columns in the file
            db_header_columns = db_for_header.get_columns()

            with tempfile.TemporaryDirectory() as tmpdir:

                # Write the header to a temporary file through the vcf writer
                header_file_tmp = os.path.join(tmpdir, "header")
                f = open(header_file_tmp, "w")
                vcf.Writer(f, header_obj)
                f.close()

                # Replace the "#CHROM" line with the real columns of the file
                header_list = db_for_header.read_header_file(
                    header_file=header_file_tmp
                )
                header_list[-1] = "\t".join(db_header_columns)

                # Remove "#CHROM" line if requested
                if remove_chrom_line:
                    header_list.pop()

                # Clean malformed header lines
                if clean_header:
                    header_list_clean = []
                    for head in header_list:
                        # FORMAT fields cannot be of Type=Flag: rewrite as String
                        head_clean = head
                        head_clean = re.subn(
                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
                            head_clean,
                            2,
                        )[0]
                        # Keep the (possibly rewritten) header line
                        header_list_clean.append(head_clean)
                    header_list = header_list_clean

                # Final header file name
                tmp_header_name = output_file + output_file_ext

                f = open(tmp_header_name, "w")
                for line in header_list:
                    f.write(line)
                f.close()

        # NOTE(review): when no header is loaded, tmp_header_name is unbound
        # here — TODO confirm this path cannot be reached without a header
        return tmp_header_name

    def export_variant_vcf(
        self,
        vcf_file,
        remove_info: bool = False,
        add_samples: bool = True,
        list_samples: list = [],
        where_clause: str = "",
        index: bool = False,
        threads: int | None = None,
    ) -> bool | None:
        """
        Export the variants as a VCF file.

        :param vcf_file: path of the output VCF file
        :param remove_info: if truthy, replace the INFO column by a constant
            (a string value is used verbatim, otherwise "."), defaults to False
        :param add_samples: include FORMAT and sample columns, defaults to True
        :param list_samples: samples to include; empty means all header samples
            (NOTE(review): mutable default argument — harmless here since it is
            only reassigned, but a None default would be safer)
        :param where_clause: optional SQL WHERE clause filtering the variants
        :param index: index the output file (tabix), defaults to False
        :param threads: number of threads (from `self.get_threads()` if unset)
        :return: the result of `export_output` (True on success, otherwise None)
        """

        # Config
        config = self.get_config()

        # Extract VCF
        log.debug("Export VCF...")

        # Table variants
        table_variants = self.get_table_variants()

        # Threads
        if not threads:
            threads = self.get_threads()

        # INFO column: either kept as-is, or replaced by a constant
        if remove_info:
            if not isinstance(remove_info, str):
                remove_info = "."
            info_field = f"""'{remove_info}' as INFO"""
        else:
            info_field = "INFO"

        # FORMAT and sample columns (only when add_samples is set)
        if add_samples:
            if not list_samples:
                list_samples = self.get_header_sample_list()
            if list_samples:
                samples_fields = " , FORMAT , " + " , ".join(list_samples)
            else:
                samples_fields = ""
            log.debug(f"samples_fields: {samples_fields}")
        else:
            samples_fields = ""

        # Normalize the WHERE clause
        if where_clause is None:
            where_clause = ""

        # Build the VCF SELECT query
        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
        log.debug(f"sql_query_select={sql_query_select}")

        # Delegate the actual writing (sorted VCF) to export_output
        return self.export_output(
            output_file=vcf_file,
            output_header=None,
            export_header=True,
            query=sql_query_select,
            parquet_partitions=None,
            chunk_size=config.get("chunk_size", None),
            threads=threads,
            sort=True,
            index=index,
            order_by=None,
        )

    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        Run a list of shell commands in parallel.

        :param commands: the list of commands to run
            (NOTE(review): mutable default argument — safe here since the list
            is never mutated, but a None default would be cleaner)
        :param threads: number of parallel workers, defaults to 1
        """

        run_parallel_commands(commands, threads)

    def get_threads(self, default: int = 1) -> int:
        """
        Return the number of threads to use.

        The value is looked up in param, then config. A value <= 0 means
        "use all available CPUs".

        :param default: value used when no threads setting is found, defaults to 1
        :return: the number of threads to use
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param takes precedence over config
        input_thread = param.get("threads", config.get("threads", None))

        # Resolve: unset -> default, <=0 -> all CPUs, otherwise the given value
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        Return the memory setting from param or config.

        :param default: value used when no memory setting is found
        :return: the memory setting (param takes precedence over config),
            or `default` when unset
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param takes precedence over config
        input_memory = param.get("memory", config.get("memory", None))

        # Fall back to the provided default
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the variants table INFO column from a VCF file, dispatching to
        the DuckDB or SQLite implementation depending on the connexion format.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO annotations of a VCF file to the variants table (DuckDB).

        The VCF is loaded into a pandas DataFrame and joined on
        (#CHROM, POS, REF, ALT); matching INFO values are concatenated
        (';'-separated) to the existing INFO column.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body (header skipped) into a DataFrame
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: vcf_df is not "unused" — DuckDB resolves `FROM vcf_df` below
        # directly against this local DataFrame (replacement scan)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
            CASE
                WHEN INFO NOT IN ('', '.')
                THEN INFO
                ELSE ''
            END,
            (
            SELECT
                concat(
                    CASE
                        WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                        THEN ';'
                        ELSE ''
                    END
                    ,
                    CASE
                        WHEN table_parquet.INFO NOT IN ('','.')
                        THEN table_parquet.INFO
                        ELSE ''
                    END
                )
            FROM vcf_df as table_parquet
            WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                AND table_parquet.\"POS\" = table_variants.\"POS\"
                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                AND table_parquet.\"REF\" = table_variants.\"REF\"
                AND table_parquet.INFO NOT IN ('','.')
            )
        )
        ;
        """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Append the INFO annotations of a VCF file to the variants table (SQLite).

        The VCF is loaded into a temporary table joined on
        (#CHROM, POS, REF, ALT); matching INFO values are concatenated
        (';'-separated, using the SQLite || operator) to the existing INFO
        column. The temporary table is dropped afterwards.

        :param vcf_file: the path to the VCF file
        """

        # Create a temporary table with the same schema as 'variants'
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body (comment lines skipped) into the temporary table
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                WHEN INFO NOT IN ('', '.')
                THEN INFO
                ELSE ''
            END ||
            (
            SELECT
                CASE
                    WHEN table_variants.INFO NOT IN ('','.')
                    AND table_vcf.INFO NOT IN ('','.')
                    THEN ';'
                    ELSE ''
                END ||
                CASE
                    WHEN table_vcf.INFO NOT IN ('','.')
                    THEN table_vcf.INFO
                    ELSE ''
                END
            FROM {table_vcf} as table_vcf
            WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                AND table_vcf.\"POS\" = table_variants.\"POS\"
                AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                AND table_vcf.\"REF\" = table_variants.\"REF\"
            )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table (if it exists).
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant_id column to the variants table and populate it with a
        hash of assembly, "#CHROM", "POS", "REF" and "ALT".

        :param variant_id_column: name of the column to create, defaults to
            "variant_id"
        :param force: recreate/refill the column even if it already exists
        :return: the name of the variant_id column
        """

        # Assembly (param takes precedence over config, then the default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE (columns removed again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Default column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column when missing, or when forced
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Fill the column with a hash of the variant key
            # NOTE(review): '"{prefix}SVTYPE"' is passed to hash() as a string
            # literal (the column NAME, not its value) — TODO confirm intent
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by explode_infos above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the variant_id column name, creating/filling the column through
        `set_variant_id` as a side effect.

        :param variant_id_column: name of the variant_id column, defaults to
            "variant_id"
        :param force: recreate/refill the column even if it already exists
        :return: the variant_id column name
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        Scan the configured database folders for available annotation databases.

        :param database_formats: database formats to look for, defaults to
            ["parquet"]
        :param database_releases: database releases to look for, defaults to
            ["current"]
        :return: a dict describing the databases matching the requested
            formats and releases
2856 """ 2857 2858 # Config 2859 config = self.get_config() 2860 2861 # Param 2862 param = self.get_param() 2863 2864 # Param - Assembly 2865 assembly = param.get("assembly", config.get("assembly", None)) 2866 if not assembly: 2867 assembly = DEFAULT_ASSEMBLY 2868 log.warning(f"Default assembly '{assembly}'") 2869 2870 # Scan for availabled databases 2871 log.info( 2872 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2873 ) 2874 databases_infos_dict = databases_infos( 2875 database_folder_releases=database_releases, 2876 database_formats=database_formats, 2877 assembly=assembly, 2878 config=config, 2879 ) 2880 log.info( 2881 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2882 ) 2883 2884 return databases_infos_dict 2885 2886 def annotation(self) -> None: 2887 """ 2888 It annotates the VCF file with the annotations specified in the config file. 
2889 """ 2890 2891 # Config 2892 config = self.get_config() 2893 2894 # Param 2895 param = self.get_param() 2896 2897 # Param - Assembly 2898 assembly = param.get("assembly", config.get("assembly", None)) 2899 if not assembly: 2900 assembly = DEFAULT_ASSEMBLY 2901 log.warning(f"Default assembly '{assembly}'") 2902 2903 # annotations databases folders 2904 annotations_databases = set( 2905 config.get("folders", {}) 2906 .get("databases", {}) 2907 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2908 + config.get("folders", {}) 2909 .get("databases", {}) 2910 .get("parquet", ["~/howard/databases/parquet/current"]) 2911 + config.get("folders", {}) 2912 .get("databases", {}) 2913 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2914 ) 2915 2916 # Get param annotations 2917 if param.get("annotations", None) and isinstance( 2918 param.get("annotations", None), str 2919 ): 2920 log.debug(param.get("annotations", None)) 2921 param_annotation_list = param.get("annotations").split(",") 2922 else: 2923 param_annotation_list = [] 2924 2925 # Each tools param 2926 if param.get("annotation_parquet", None) != None: 2927 log.debug( 2928 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2929 ) 2930 if isinstance(param.get("annotation_parquet", None), list): 2931 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2932 else: 2933 param_annotation_list.append(param.get("annotation_parquet")) 2934 if param.get("annotation_snpsift", None) != None: 2935 if isinstance(param.get("annotation_snpsift", None), list): 2936 param_annotation_list.append( 2937 "snpsift:" 2938 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2939 ) 2940 else: 2941 param_annotation_list.append( 2942 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2943 ) 2944 if param.get("annotation_snpeff", None) != None: 2945 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2946 if param.get("annotation_bcftools", 
None) != None: 2947 if isinstance(param.get("annotation_bcftools", None), list): 2948 param_annotation_list.append( 2949 "bcftools:" 2950 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2951 ) 2952 else: 2953 param_annotation_list.append( 2954 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2955 ) 2956 if param.get("annotation_annovar", None) != None: 2957 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2958 if param.get("annotation_exomiser", None) != None: 2959 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2960 if param.get("annotation_splice", None) != None: 2961 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2962 2963 # Merge param annotations list 2964 param["annotations"] = ",".join(param_annotation_list) 2965 2966 # debug 2967 log.debug(f"param_annotations={param['annotations']}") 2968 2969 if param.get("annotations"): 2970 2971 # Log 2972 # log.info("Annotations - Check annotation parameters") 2973 2974 if not "annotation" in param: 2975 param["annotation"] = {} 2976 2977 # List of annotations parameters 2978 annotations_list_input = {} 2979 if isinstance(param.get("annotations", None), str): 2980 annotation_file_list = [ 2981 value for value in param.get("annotations", "").split(",") 2982 ] 2983 for annotation_file in annotation_file_list: 2984 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2985 else: 2986 annotations_list_input = param.get("annotations", {}) 2987 2988 log.info(f"Quick Annotations:") 2989 for annotation_key in list(annotations_list_input.keys()): 2990 log.info(f" {annotation_key}") 2991 2992 # List of annotations and associated fields 2993 annotations_list = {} 2994 2995 for annotation_file in annotations_list_input: 2996 2997 # Explode annotations if ALL 2998 if ( 2999 annotation_file.upper() == "ALL" 3000 or annotation_file.upper().startswith("ALL:") 3001 ): 3002 3003 # check ALL parameters (formats, 
releases) 3004 annotation_file_split = annotation_file.split(":") 3005 database_formats = "parquet" 3006 database_releases = "current" 3007 for annotation_file_option in annotation_file_split[1:]: 3008 database_all_options_split = annotation_file_option.split("=") 3009 if database_all_options_split[0] == "format": 3010 database_formats = database_all_options_split[1].split("+") 3011 if database_all_options_split[0] == "release": 3012 database_releases = database_all_options_split[1].split("+") 3013 3014 # Scan for availabled databases 3015 databases_infos_dict = self.scan_databases( 3016 database_formats=database_formats, 3017 database_releases=database_releases, 3018 ) 3019 3020 # Add found databases in annotation parameters 3021 for database_infos in databases_infos_dict.keys(): 3022 annotations_list[database_infos] = {"INFO": None} 3023 3024 else: 3025 annotations_list[annotation_file] = annotations_list_input[ 3026 annotation_file 3027 ] 3028 3029 # Check each databases 3030 if len(annotations_list): 3031 3032 log.info( 3033 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3034 ) 3035 3036 for annotation_file in annotations_list: 3037 3038 # Init 3039 annotations = annotations_list.get(annotation_file, None) 3040 3041 # Annotation snpEff 3042 if annotation_file.startswith("snpeff"): 3043 3044 log.debug(f"Quick Annotation snpEff") 3045 3046 if "snpeff" not in param["annotation"]: 3047 param["annotation"]["snpeff"] = {} 3048 3049 if "options" not in param["annotation"]["snpeff"]: 3050 param["annotation"]["snpeff"]["options"] = "" 3051 3052 # snpEff options in annotations 3053 param["annotation"]["snpeff"]["options"] = "".join( 3054 annotation_file.split(":")[1:] 3055 ) 3056 3057 # Annotation Annovar 3058 elif annotation_file.startswith("annovar"): 3059 3060 log.debug(f"Quick Annotation Annovar") 3061 3062 if "annovar" not in param["annotation"]: 3063 param["annotation"]["annovar"] = {} 3064 3065 if "annotations" not in param["annotation"]["annovar"]: 3066 param["annotation"]["annovar"]["annotations"] = {} 3067 3068 # Options 3069 annotation_file_split = annotation_file.split(":") 3070 for annotation_file_annotation in annotation_file_split[1:]: 3071 if annotation_file_annotation: 3072 param["annotation"]["annovar"]["annotations"][ 3073 annotation_file_annotation 3074 ] = annotations 3075 3076 # Annotation Exomiser 3077 elif annotation_file.startswith("exomiser"): 3078 3079 log.debug(f"Quick Annotation Exomiser") 3080 3081 param["annotation"]["exomiser"] = params_string_to_dict( 3082 annotation_file 3083 ) 3084 3085 # Annotation Splice 3086 elif annotation_file.startswith("splice"): 3087 3088 log.debug(f"Quick Annotation Splice") 3089 3090 param["annotation"]["splice"] = params_string_to_dict( 3091 annotation_file 3092 ) 3093 3094 # Annotation Parquet or BCFTOOLS 3095 else: 3096 3097 # Tools detection 3098 if annotation_file.startswith("bcftools:"): 3099 annotation_tool_initial = "bcftools" 3100 annotation_file = ":".join(annotation_file.split(":")[1:]) 3101 elif annotation_file.startswith("snpsift:"): 3102 annotation_tool_initial = 
"snpsift" 3103 annotation_file = ":".join(annotation_file.split(":")[1:]) 3104 elif annotation_file.startswith("bigwig:"): 3105 annotation_tool_initial = "bigwig" 3106 annotation_file = ":".join(annotation_file.split(":")[1:]) 3107 else: 3108 annotation_tool_initial = None 3109 3110 # list of files 3111 annotation_file_list = annotation_file.replace("+", ":").split( 3112 ":" 3113 ) 3114 3115 for annotation_file in annotation_file_list: 3116 3117 if annotation_file: 3118 3119 # Annotation tool initial 3120 annotation_tool = annotation_tool_initial 3121 3122 # Find file 3123 annotation_file_found = None 3124 3125 if os.path.exists(annotation_file): 3126 annotation_file_found = annotation_file 3127 elif os.path.exists(full_path(annotation_file)): 3128 annotation_file_found = full_path(annotation_file) 3129 else: 3130 # Find within assembly folders 3131 for annotations_database in annotations_databases: 3132 found_files = find_all( 3133 annotation_file, 3134 os.path.join( 3135 annotations_database, assembly 3136 ), 3137 ) 3138 if len(found_files) > 0: 3139 annotation_file_found = found_files[0] 3140 break 3141 if not annotation_file_found and not assembly: 3142 # Find within folders 3143 for ( 3144 annotations_database 3145 ) in annotations_databases: 3146 found_files = find_all( 3147 annotation_file, annotations_database 3148 ) 3149 if len(found_files) > 0: 3150 annotation_file_found = found_files[0] 3151 break 3152 log.debug( 3153 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3154 ) 3155 3156 # Full path 3157 annotation_file_found = full_path(annotation_file_found) 3158 3159 if annotation_file_found: 3160 3161 database = Database(database=annotation_file_found) 3162 quick_annotation_format = database.get_format() 3163 quick_annotation_is_compressed = ( 3164 database.is_compressed() 3165 ) 3166 quick_annotation_is_indexed = os.path.exists( 3167 f"{annotation_file_found}.tbi" 3168 ) 3169 bcftools_preference = False 3170 3171 # Check Annotation 
Tool 3172 if not annotation_tool: 3173 if ( 3174 bcftools_preference 3175 and quick_annotation_format 3176 in ["vcf", "bed"] 3177 and quick_annotation_is_compressed 3178 and quick_annotation_is_indexed 3179 ): 3180 annotation_tool = "bcftools" 3181 elif quick_annotation_format in [ 3182 "vcf", 3183 "bed", 3184 "tsv", 3185 "tsv", 3186 "csv", 3187 "json", 3188 "tbl", 3189 "parquet", 3190 "duckdb", 3191 ]: 3192 annotation_tool = "parquet" 3193 elif quick_annotation_format in ["bw"]: 3194 annotation_tool = "bigwig" 3195 else: 3196 log.error( 3197 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3198 ) 3199 raise ValueError( 3200 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3201 ) 3202 3203 log.debug( 3204 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3205 ) 3206 3207 # Annotation Tool dispatch 3208 if annotation_tool: 3209 if annotation_tool not in param["annotation"]: 3210 param["annotation"][annotation_tool] = {} 3211 if ( 3212 "annotations" 3213 not in param["annotation"][annotation_tool] 3214 ): 3215 param["annotation"][annotation_tool][ 3216 "annotations" 3217 ] = {} 3218 param["annotation"][annotation_tool][ 3219 "annotations" 3220 ][annotation_file_found] = annotations 3221 3222 else: 3223 log.warning( 3224 f"Quick Annotation File {annotation_file} does NOT exist" 3225 ) 3226 3227 self.set_param(param) 3228 3229 if param.get("annotation", None): 3230 log.info("Annotations") 3231 if param.get("annotation", {}).get("parquet", None): 3232 log.info("Annotations 'parquet'...") 3233 self.annotation_parquet() 3234 if param.get("annotation", {}).get("bcftools", None): 3235 log.info("Annotations 'bcftools'...") 3236 self.annotation_bcftools() 3237 if param.get("annotation", {}).get("snpsift", None): 3238 log.info("Annotations 'snpsift'...") 3239 self.annotation_snpsift() 3240 if param.get("annotation", {}).get("bigwig", None): 
3241 log.info("Annotations 'bigwig'...") 3242 self.annotation_bigwig() 3243 if param.get("annotation", {}).get("annovar", None): 3244 log.info("Annotations 'annovar'...") 3245 self.annotation_annovar() 3246 if param.get("annotation", {}).get("snpeff", None): 3247 log.info("Annotations 'snpeff'...") 3248 self.annotation_snpeff() 3249 if param.get("annotation", {}).get("exomiser", None) is not None: 3250 log.info("Annotations 'exomiser'...") 3251 self.annotation_exomiser() 3252 if param.get("annotation", {}).get("splice", None) is not None: 3253 log.info("Annotations 'splice' ...") 3254 self.annotation_splice() 3255 3256 # Explode INFOS fields into table fields 3257 if self.get_explode_infos(): 3258 self.explode_infos( 3259 prefix=self.get_explode_infos_prefix(), 3260 fields=self.get_explode_infos_fields(), 3261 force=True, 3262 ) 3263 3264 def annotation_bigwig(self, threads: int = None) -> None: 3265 """ 3266 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3267 3268 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3269 number of threads to be used for parallel processing during the annotation process. 
If the 3270 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3271 threads to use based on the system configuration 3272 :type threads: int 3273 :return: True 3274 """ 3275 3276 # DEBUG 3277 log.debug("Start annotation with bigwig databases") 3278 3279 # # Threads 3280 # if not threads: 3281 # threads = self.get_threads() 3282 # log.debug("Threads: " + str(threads)) 3283 3284 # Config 3285 config = self.get_config() 3286 log.debug("Config: " + str(config)) 3287 3288 # Config - BCFTools databases folders 3289 databases_folders = set( 3290 self.get_config() 3291 .get("folders", {}) 3292 .get("databases", {}) 3293 .get("annotations", ["."]) 3294 + self.get_config() 3295 .get("folders", {}) 3296 .get("databases", {}) 3297 .get("bigwig", ["."]) 3298 ) 3299 log.debug("Databases annotations: " + str(databases_folders)) 3300 3301 # Param 3302 annotations = ( 3303 self.get_param() 3304 .get("annotation", {}) 3305 .get("bigwig", {}) 3306 .get("annotations", None) 3307 ) 3308 log.debug("Annotations: " + str(annotations)) 3309 3310 # Assembly 3311 assembly = self.get_param().get( 3312 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3313 ) 3314 3315 # Data 3316 table_variants = self.get_table_variants() 3317 3318 # Check if not empty 3319 log.debug("Check if not empty") 3320 sql_query_chromosomes = ( 3321 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3322 ) 3323 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3324 if not sql_query_chromosomes_df["count"][0]: 3325 log.info(f"VCF empty") 3326 return 3327 3328 # VCF header 3329 vcf_reader = self.get_header() 3330 log.debug("Initial header: " + str(vcf_reader.infos)) 3331 3332 # Existing annotations 3333 for vcf_annotation in self.get_header().infos: 3334 3335 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3336 log.debug( 3337 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3338 
) 3339 3340 if annotations: 3341 3342 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3343 3344 # Export VCF file 3345 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3346 3347 # annotation_bigwig_config 3348 annotation_bigwig_config_list = [] 3349 3350 for annotation in annotations: 3351 annotation_fields = annotations[annotation] 3352 3353 # Annotation Name 3354 annotation_name = os.path.basename(annotation) 3355 3356 if not annotation_fields: 3357 annotation_fields = {"INFO": None} 3358 3359 log.debug(f"Annotation '{annotation_name}'") 3360 log.debug( 3361 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3362 ) 3363 3364 # Create Database 3365 database = Database( 3366 database=annotation, 3367 databases_folders=databases_folders, 3368 assembly=assembly, 3369 ) 3370 3371 # Find files 3372 db_file = database.get_database() 3373 db_file = full_path(db_file) 3374 db_hdr_file = database.get_header_file() 3375 db_hdr_file = full_path(db_hdr_file) 3376 db_file_type = database.get_format() 3377 3378 # If db_file is http ? 
3379 if database.get_database().startswith("http"): 3380 3381 # Datbase is HTTP URL 3382 db_file_is_http = True 3383 3384 # DB file keep as URL 3385 db_file = database.get_database() 3386 log.warning( 3387 f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)" 3388 ) 3389 3390 # Retrieve automatic annotation field name 3391 annotation_field = clean_annotation_field( 3392 os.path.basename(db_file).replace(".bw", "") 3393 ) 3394 log.debug( 3395 f"Create header file with annotation field '{annotation_field}' is an HTTP URL" 3396 ) 3397 3398 # Create automatic header file 3399 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3400 with open(db_hdr_file, "w") as f: 3401 f.write("##fileformat=VCFv4.2\n") 3402 f.write( 3403 f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""" 3404 ) 3405 f.write(f"#CHROM START END {annotation_field}\n") 3406 3407 else: 3408 3409 # Datbase is NOT HTTP URL 3410 db_file_is_http = False 3411 3412 # Check index - try to create if not exists 3413 if ( 3414 db_file is None 3415 or db_hdr_file is None 3416 or (not os.path.exists(db_file) and not db_file_is_http) 3417 or not os.path.exists(db_hdr_file) 3418 or not db_file_type in ["bw"] 3419 ): 3420 # if False: 3421 log.error("Annotation failed: database not valid") 3422 log.error(f"Annotation annotation file: {db_file}") 3423 log.error(f"Annotation annotation file type: {db_file_type}") 3424 log.error(f"Annotation annotation header: {db_hdr_file}") 3425 raise ValueError( 3426 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3427 ) 3428 else: 3429 3430 # Log 3431 log.debug( 3432 f"Annotation '{annotation}' - file: " 3433 + str(db_file) 3434 + " and " 3435 + str(db_hdr_file) 3436 ) 3437 3438 # Load header as VCF object 3439 db_hdr_vcf = Variants(input=db_hdr_file) 3440 db_hdr_vcf_header_infos = 
db_hdr_vcf.get_header().infos 3441 log.debug( 3442 "Annotation database header: " 3443 + str(db_hdr_vcf_header_infos) 3444 ) 3445 3446 # For all fields in database 3447 annotation_fields_full = False 3448 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3449 annotation_fields = { 3450 key: key for key in db_hdr_vcf_header_infos 3451 } 3452 log.debug( 3453 "Annotation database header - All annotations added: " 3454 + str(annotation_fields) 3455 ) 3456 annotation_fields_full = True 3457 3458 # Init 3459 cyvcf2_header_rename_dict = {} 3460 cyvcf2_header_list = [] 3461 cyvcf2_header_indexes = {} 3462 3463 # process annotation fields 3464 for annotation_field in annotation_fields: 3465 3466 # New annotation name 3467 annotation_field_new = annotation_fields[annotation_field] 3468 3469 # Check annotation field and index in header 3470 if ( 3471 annotation_field 3472 in db_hdr_vcf.get_header_columns_as_list() 3473 ): 3474 annotation_field_index = ( 3475 db_hdr_vcf.get_header_columns_as_list().index( 3476 annotation_field 3477 ) 3478 - 3 3479 ) 3480 cyvcf2_header_indexes[annotation_field_new] = ( 3481 annotation_field_index 3482 ) 3483 else: 3484 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3485 log.error(msg_err) 3486 raise ValueError(msg_err) 3487 3488 # Append annotation field in cyvcf2 header list 3489 cyvcf2_header_rename_dict[annotation_field_new] = ( 3490 db_hdr_vcf_header_infos[annotation_field].id 3491 ) 3492 cyvcf2_header_list.append( 3493 { 3494 "ID": annotation_field_new, 3495 "Number": db_hdr_vcf_header_infos[ 3496 annotation_field 3497 ].num, 3498 "Type": db_hdr_vcf_header_infos[ 3499 annotation_field 3500 ].type, 3501 "Description": db_hdr_vcf_header_infos[ 3502 annotation_field 3503 ].desc, 3504 } 3505 ) 3506 3507 # Add header on VCF 3508 vcf_reader.infos[annotation_field_new] = vcf.parser._Info( 3509 annotation_field_new, 3510 db_hdr_vcf_header_infos[annotation_field].num, 3511 
db_hdr_vcf_header_infos[annotation_field].type, 3512 db_hdr_vcf_header_infos[annotation_field].desc, 3513 "HOWARD BigWig annotation", 3514 "unknown", 3515 self.code_type_map[ 3516 db_hdr_vcf_header_infos[annotation_field].type 3517 ], 3518 ) 3519 3520 # Load bigwig database 3521 bw_db = pyBigWig.open(db_file) 3522 if bw_db.isBigWig(): 3523 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3524 else: 3525 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3526 log.error(msg_err) 3527 raise ValueError(msg_err) 3528 3529 annotation_bigwig_config_list.append( 3530 { 3531 "db_file": db_file, 3532 "bw_db": bw_db, 3533 "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3534 "cyvcf2_header_list": cyvcf2_header_list, 3535 "cyvcf2_header_indexes": cyvcf2_header_indexes, 3536 } 3537 ) 3538 3539 # Annotate 3540 if annotation_bigwig_config_list: 3541 3542 # Annotation config 3543 log.debug( 3544 f"annotation_bigwig_config={annotation_bigwig_config_list}" 3545 ) 3546 3547 # Export VCF file 3548 self.export_variant_vcf( 3549 vcf_file=tmp_vcf_name, 3550 remove_info=True, 3551 add_samples=False, 3552 index=True, 3553 ) 3554 3555 # Load input tmp file 3556 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3557 3558 # Add header in input file 3559 for annotation_bigwig_config in annotation_bigwig_config_list: 3560 for cyvcf2_header_field in annotation_bigwig_config.get( 3561 "cyvcf2_header_list", [] 3562 ): 3563 log.info( 3564 f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'" 3565 ) 3566 input_vcf.add_info_to_header(cyvcf2_header_field) 3567 3568 # Create output VCF file 3569 output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz") 3570 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3571 3572 # Fetch variants 3573 log.info(f"Annotations 'bigwig' start...") 3574 
for variant in input_vcf: 3575 3576 for annotation_bigwig_config in annotation_bigwig_config_list: 3577 3578 # DB and indexes 3579 bw_db = annotation_bigwig_config.get("bw_db", None) 3580 cyvcf2_header_indexes = annotation_bigwig_config.get( 3581 "cyvcf2_header_indexes", None 3582 ) 3583 3584 # Retrieve value from chrom pos 3585 res = bw_db.values( 3586 variant.CHROM, variant.POS - 1, variant.POS 3587 ) 3588 3589 # For each annotation fields (and indexes) 3590 for cyvcf2_header_index in cyvcf2_header_indexes: 3591 3592 # If value is NOT nNone 3593 if not np.isnan( 3594 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3595 ): 3596 variant.INFO[cyvcf2_header_index] = res[ 3597 cyvcf2_header_indexes[cyvcf2_header_index] 3598 ] 3599 3600 # Add record in output file 3601 output_vcf.write_record(variant) 3602 3603 # Log 3604 log.debug(f"Annotation done.") 3605 3606 # Close and write file 3607 log.info(f"Annotations 'bigwig' write...") 3608 output_vcf.close() 3609 log.debug(f"Write done.") 3610 3611 # Update variants 3612 log.info(f"Annotations 'bigwig' update...") 3613 self.update_from_vcf(output_vcf_file) 3614 log.debug(f"Update done.") 3615 3616 return True 3617 3618 def annotation_snpsift(self, threads: int = None) -> None: 3619 """ 3620 This function annotate with bcftools 3621 3622 :param threads: Number of threads to use 3623 :return: the value of the variable "return_value". 
3624 """ 3625 3626 # DEBUG 3627 log.debug("Start annotation with bcftools databases") 3628 3629 # Threads 3630 if not threads: 3631 threads = self.get_threads() 3632 log.debug("Threads: " + str(threads)) 3633 3634 # Config 3635 config = self.get_config() 3636 log.debug("Config: " + str(config)) 3637 3638 # Config - snpSift 3639 snpsift_bin_command = get_bin_command( 3640 bin="SnpSift.jar", 3641 tool="snpsift", 3642 bin_type="jar", 3643 config=config, 3644 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3645 ) 3646 if not snpsift_bin_command: 3647 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3648 log.error(msg_err) 3649 raise ValueError(msg_err) 3650 3651 # Config - bcftools 3652 bcftools_bin_command = get_bin_command( 3653 bin="bcftools", 3654 tool="bcftools", 3655 bin_type="bin", 3656 config=config, 3657 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3658 ) 3659 if not bcftools_bin_command: 3660 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3661 log.error(msg_err) 3662 raise ValueError(msg_err) 3663 3664 # Config - BCFTools databases folders 3665 databases_folders = set( 3666 self.get_config() 3667 .get("folders", {}) 3668 .get("databases", {}) 3669 .get("annotations", ["."]) 3670 + self.get_config() 3671 .get("folders", {}) 3672 .get("databases", {}) 3673 .get("bcftools", ["."]) 3674 ) 3675 log.debug("Databases annotations: " + str(databases_folders)) 3676 3677 # Param 3678 annotations = ( 3679 self.get_param() 3680 .get("annotation", {}) 3681 .get("snpsift", {}) 3682 .get("annotations", None) 3683 ) 3684 log.debug("Annotations: " + str(annotations)) 3685 3686 # Assembly 3687 assembly = self.get_param().get( 3688 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3689 ) 3690 3691 # Data 3692 table_variants = self.get_table_variants() 3693 3694 # Check if not empty 3695 log.debug("Check if not empty") 3696 sql_query_chromosomes = ( 3697 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3698 ) 3699 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3700 if not sql_query_chromosomes_df["count"][0]: 3701 log.info(f"VCF empty") 3702 return 3703 3704 # VCF header 3705 vcf_reader = self.get_header() 3706 log.debug("Initial header: " + str(vcf_reader.infos)) 3707 3708 # Existing annotations 3709 for vcf_annotation in self.get_header().infos: 3710 3711 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3712 log.debug( 3713 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3714 ) 3715 3716 if annotations: 3717 3718 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3719 3720 # Export VCF file 3721 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3722 3723 # Init 3724 commands = {} 3725 3726 for annotation in annotations: 3727 annotation_fields = annotations[annotation] 3728 3729 # Annotation Name 3730 annotation_name = os.path.basename(annotation) 3731 3732 if not annotation_fields: 3733 annotation_fields = {"INFO": None} 3734 3735 log.debug(f"Annotation '{annotation_name}'") 3736 log.debug( 3737 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3738 ) 3739 3740 # Create Database 3741 database = Database( 3742 database=annotation, 3743 databases_folders=databases_folders, 3744 assembly=assembly, 3745 ) 3746 3747 # Find files 3748 db_file = database.get_database() 3749 db_file = full_path(db_file) 3750 db_hdr_file = database.get_header_file() 3751 db_hdr_file = full_path(db_hdr_file) 3752 db_file_type = database.get_format() 3753 db_tbi_file = f"{db_file}.tbi" 3754 db_file_compressed = database.is_compressed() 3755 3756 # Check if compressed 3757 if not db_file_compressed: 3758 log.error( 3759 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3760 ) 3761 raise ValueError( 3762 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3763 ) 3764 3765 # Check if indexed 3766 if not os.path.exists(db_tbi_file): 3767 log.error( 3768 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3769 ) 3770 raise ValueError( 3771 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3772 ) 3773 3774 # Check index - try to create if not exists 3775 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3776 log.error("Annotation failed: database not valid") 3777 log.error(f"Annotation annotation file: {db_file}") 3778 log.error(f"Annotation annotation header: {db_hdr_file}") 3779 log.error(f"Annotation annotation index: {db_tbi_file}") 3780 raise ValueError( 3781 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3782 ) 3783 else: 3784 3785 log.debug( 3786 f"Annotation '{annotation}' - file: " 3787 + str(db_file) 3788 + " and " 3789 + str(db_hdr_file) 3790 ) 3791 3792 # Load header as VCF object 3793 db_hdr_vcf = Variants(input=db_hdr_file) 3794 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3795 log.debug( 3796 "Annotation database header: " 3797 + str(db_hdr_vcf_header_infos) 3798 ) 3799 3800 # For all fields in database 3801 annotation_fields_full = False 3802 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3803 annotation_fields = { 3804 key: key for key in db_hdr_vcf_header_infos 3805 } 3806 log.debug( 3807 "Annotation database header - All annotations added: " 3808 + str(annotation_fields) 3809 ) 3810 annotation_fields_full = True 3811 3812 # # Create file for field rename 3813 # log.debug("Create file for field rename") 3814 # tmp_rename = NamedTemporaryFile( 3815 # prefix=self.get_prefix(), 3816 # dir=self.get_tmp_dir(), 3817 # suffix=".rename", 3818 # delete=False, 3819 # ) 3820 # tmp_rename_name = tmp_rename.name 3821 # tmp_files.append(tmp_rename_name) 3822 3823 # Number of fields 3824 nb_annotation_field = 0 3825 annotation_list = [] 3826 annotation_infos_rename_list = [] 3827 3828 for annotation_field in 
annotation_fields: 3829 3830 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3831 annotation_fields_new_name = annotation_fields.get( 3832 annotation_field, annotation_field 3833 ) 3834 if not annotation_fields_new_name: 3835 annotation_fields_new_name = annotation_field 3836 3837 # Check if field is in DB and if field is not elready in input data 3838 if ( 3839 annotation_field in db_hdr_vcf.get_header().infos 3840 and annotation_fields_new_name 3841 not in self.get_header().infos 3842 ): 3843 3844 log.info( 3845 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3846 ) 3847 3848 # BCFTools annotate param to rename fields 3849 if annotation_field != annotation_fields_new_name: 3850 annotation_infos_rename_list.append( 3851 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3852 ) 3853 3854 # Add INFO field to header 3855 db_hdr_vcf_header_infos_number = ( 3856 db_hdr_vcf_header_infos[annotation_field].num or "." 3857 ) 3858 db_hdr_vcf_header_infos_type = ( 3859 db_hdr_vcf_header_infos[annotation_field].type 3860 or "String" 3861 ) 3862 db_hdr_vcf_header_infos_description = ( 3863 db_hdr_vcf_header_infos[annotation_field].desc 3864 or f"{annotation_field} description" 3865 ) 3866 db_hdr_vcf_header_infos_source = ( 3867 db_hdr_vcf_header_infos[annotation_field].source 3868 or "unknown" 3869 ) 3870 db_hdr_vcf_header_infos_version = ( 3871 db_hdr_vcf_header_infos[annotation_field].version 3872 or "unknown" 3873 ) 3874 3875 vcf_reader.infos[annotation_fields_new_name] = ( 3876 vcf.parser._Info( 3877 annotation_fields_new_name, 3878 db_hdr_vcf_header_infos_number, 3879 db_hdr_vcf_header_infos_type, 3880 db_hdr_vcf_header_infos_description, 3881 db_hdr_vcf_header_infos_source, 3882 db_hdr_vcf_header_infos_version, 3883 self.code_type_map[ 3884 db_hdr_vcf_header_infos_type 3885 ], 3886 ) 3887 ) 3888 3889 annotation_list.append(annotation_field) 3890 3891 nb_annotation_field += 1 3892 3893 else: 3894 
3895 if ( 3896 annotation_field 3897 not in db_hdr_vcf.get_header().infos 3898 ): 3899 log.warning( 3900 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3901 ) 3902 if ( 3903 annotation_fields_new_name 3904 in self.get_header().infos 3905 ): 3906 log.warning( 3907 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3908 ) 3909 3910 log.info( 3911 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3912 ) 3913 3914 annotation_infos = ",".join(annotation_list) 3915 3916 if annotation_infos != "": 3917 3918 # Annotated VCF (and error file) 3919 tmp_annotation_vcf_name = os.path.join( 3920 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3921 ) 3922 tmp_annotation_vcf_name_err = ( 3923 tmp_annotation_vcf_name + ".err" 3924 ) 3925 3926 # Add fields to annotate 3927 if not annotation_fields_full: 3928 annotation_infos_option = f"-info {annotation_infos}" 3929 else: 3930 annotation_infos_option = "" 3931 3932 # Info fields rename 3933 if annotation_infos_rename_list: 3934 annotation_infos_rename = " -c " + ",".join( 3935 annotation_infos_rename_list 3936 ) 3937 else: 3938 annotation_infos_rename = "" 3939 3940 # Annotate command 3941 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3942 3943 # Add command 3944 commands[command_annotate] = tmp_annotation_vcf_name 3945 3946 if commands: 3947 3948 # Export VCF file 3949 self.export_variant_vcf( 3950 vcf_file=tmp_vcf_name, 3951 remove_info=True, 3952 add_samples=False, 3953 index=True, 3954 ) 3955 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3956 3957 # Num command 3958 nb_command = 0 3959 3960 # Annotate 3961 for command_annotate in commands: 3962 nb_command += 1 3963 log.info( 3964 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..."
                )
                log.debug(f"command_annotate={command_annotate}")
                run_parallel_commands([command_annotate], threads)

                # Debug
                shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                # Update variants
                log.info(
                    f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                )
                self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        This function annotate with bcftools.

        Annotates the variants table with one or several VCF/BED databases using
        "bcftools annotate": the current variants are exported to a temporary
        bgzipped VCF, each configured database is applied per chromosome on
        merged +/-1Mb regions (written to a temporary BED), the per-database
        annotated files are merged back with "bcftools merge", and the resulting
        INFO fields are re-imported into the variants table.

        Databases come from param section "annotation" > "bcftools" >
        "annotations" (a dict mapping database path -> fields dict); each
        database file must be bgzip-compressed and tabix-indexed, and a header
        file must be available next to it.

        :param threads: Number of threads to use; defaults to self.get_threads()
        :return: None (returns early when the variants table is empty or when
            no annotation produced any command)
        :raises ValueError: if no bcftools binary is found, if a database is not
            compressed / not indexed / missing its data or header file, or if an
            annotation command wrote "[E::" error lines to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # Keep temporary files/folders when verbosity is "debug".
        # NOTE(review): delete_tmp is computed but not referenced later in this
        # method — temporary files are removed via the shell "rm -f" appended to
        # the merge command instead; confirm whether this flag should gate that.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with (dict: database -> fields)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param wins over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # The file itself is only written later (export_variant_vcf), once we
        # know at least one annotation command exists; here we reserve the name.
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields of the database"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the file within databases_folders)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools needs bgzipped input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi expected next to the database)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                # (no index/file creation is attempted here, despite the log wording)
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    # ("ALL" or "INFO" requests every INFO field of the database)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header
                            # (fall back to permissive defaults for missing metadata)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # Use bcftools rename syntax NEW:=INFO/OLD when renaming
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command: keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        # (BED has no INFO column layout; prepend coordinate columns)
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window around each variant
                            # position, clamped at 0, then merged into intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                    CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                    \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (per-database/per-chromosome annotated VCF)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate restricted to the region BED, using
                            # the protected header, then tabix-index the result;
                            # stderr appended to the .err file for later inspection
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (the actual input all commands annotate)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands (share total threads
                # across the commands that run in parallel)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-database/per-chromosome annotated VCFs
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (appended to the merge shell command)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: collect bcftools/tabix stderr output;
                    # "[W::" lines are warnings, "[E::" lines are fatal errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotipic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
4496 Example: 4497 "phenotypicFeatures": 4498 [ 4499 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4500 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4501 ] 4502 - "hpo" (list) 4503 List of HPO ids as phenotypic features. 4504 Example: 4505 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4506 Default: [] 4507 - "outputOptions" (dict): 4508 Output options (see Exomiser docs). 4509 Default: 4510 "output_options" = 4511 { 4512 "outputContributingVariantsOnly": False, 4513 "numGenes": 0, 4514 "outputFormats": ["TSV_VARIANT", "VCF"] 4515 } 4516 - "transcript_source" (string): 4517 Transcript source (either "refseq", "ucsc", "ensembl") 4518 Default: "refseq" 4519 - "exomiser_to_info" (boolean): 4520 Add exomiser TSV file columns as INFO fields in VCF. 4521 Default: False 4522 - "release" (string): 4523 Exomise database release. 4524 If not exists, database release will be downloaded (take a while). 4525 Default: None (provided by application.properties configuration file) 4526 - "exomiser_application_properties" (file): 4527 Exomiser configuration file (see Exomiser docs). 4528 Useful to automatically download databases (especially for specific genome databases). 4529 4530 Notes: 4531 - If no sample in parameters, first sample in VCF will be chosen 4532 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4533 4534 :param threads: The number of threads to use 4535 :return: None. 
4536 """ 4537 4538 # DEBUG 4539 log.debug("Start annotation with Exomiser databases") 4540 4541 # Threads 4542 if not threads: 4543 threads = self.get_threads() 4544 log.debug("Threads: " + str(threads)) 4545 4546 # Config 4547 config = self.get_config() 4548 log.debug("Config: " + str(config)) 4549 4550 # Config - Folders - Databases 4551 databases_folders = ( 4552 config.get("folders", {}) 4553 .get("databases", {}) 4554 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4555 ) 4556 databases_folders = full_path(databases_folders) 4557 if not os.path.exists(databases_folders): 4558 log.error(f"Databases annotations: {databases_folders} NOT found") 4559 log.debug("Databases annotations: " + str(databases_folders)) 4560 4561 # Config - Exomiser 4562 exomiser_bin_command = get_bin_command( 4563 bin="exomiser-cli*.jar", 4564 tool="exomiser", 4565 bin_type="jar", 4566 config=config, 4567 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4568 ) 4569 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4570 if not exomiser_bin_command: 4571 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4572 log.error(msg_err) 4573 raise ValueError(msg_err) 4574 4575 # Param 4576 param = self.get_param() 4577 log.debug("Param: " + str(param)) 4578 4579 # Param - Exomiser 4580 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4581 log.debug(f"Param Exomiser: {param_exomiser}") 4582 4583 # Param - Assembly 4584 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4585 log.debug("Assembly: " + str(assembly)) 4586 4587 # Data 4588 table_variants = self.get_table_variants() 4589 4590 # Check if not empty 4591 log.debug("Check if not empty") 4592 sql_query_chromosomes = ( 4593 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4594 ) 4595 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4596 log.info(f"VCF empty") 4597 return False 4598 4599 # VCF header 4600 
vcf_reader = self.get_header() 4601 log.debug("Initial header: " + str(vcf_reader.infos)) 4602 4603 # Samples 4604 samples = self.get_header_sample_list() 4605 if not samples: 4606 log.error("No Samples in VCF") 4607 return False 4608 log.debug(f"Samples: {samples}") 4609 4610 # Memory limit 4611 memory_limit = self.get_memory("8G") 4612 log.debug(f"memory_limit: {memory_limit}") 4613 4614 # Exomiser java options 4615 exomiser_java_options = ( 4616 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4617 ) 4618 log.debug(f"Exomiser java options: {exomiser_java_options}") 4619 4620 # Download Exomiser (if not exists) 4621 exomiser_release = param_exomiser.get("release", None) 4622 exomiser_application_properties = param_exomiser.get( 4623 "exomiser_application_properties", None 4624 ) 4625 databases_download_exomiser( 4626 assemblies=[assembly], 4627 exomiser_folder=databases_folders, 4628 exomiser_release=exomiser_release, 4629 exomiser_phenotype_release=exomiser_release, 4630 exomiser_application_properties=exomiser_application_properties, 4631 ) 4632 4633 # Force annotation 4634 force_update_annotation = True 4635 4636 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4637 log.debug("Start annotation Exomiser") 4638 4639 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4640 4641 # tmp_dir = "/tmp/exomiser" 4642 4643 ### ANALYSIS ### 4644 ################ 4645 4646 # Create analysis.json through analysis dict 4647 # either analysis in param or by default 4648 # depending on preset exome/genome) 4649 4650 # Init analysis dict 4651 param_exomiser_analysis_dict = {} 4652 4653 # analysis from param 4654 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4655 param_exomiser_analysis = full_path(param_exomiser_analysis) 4656 4657 # If analysis in param -> load anlaysis json 4658 if param_exomiser_analysis: 4659 4660 # If param analysis is a file and exists 4661 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4662 param_exomiser_analysis 4663 ): 4664 # Load analysis file into analysis dict (either yaml or json) 4665 with open(param_exomiser_analysis) as json_file: 4666 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4667 4668 # If param analysis is a dict 4669 elif isinstance(param_exomiser_analysis, dict): 4670 # Load analysis dict into analysis dict (either yaml or json) 4671 param_exomiser_analysis_dict = param_exomiser_analysis 4672 4673 # Error analysis type 4674 else: 4675 log.error(f"Analysis type unknown. Check param file.") 4676 raise ValueError(f"Analysis type unknown. Check param file.") 4677 4678 # Case no input analysis config file/dict 4679 # Use preset (exome/genome) to open default config file 4680 if not param_exomiser_analysis_dict: 4681 4682 # default preset 4683 default_preset = "exome" 4684 4685 # Get param preset or default preset 4686 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4687 4688 # Try to find if preset is a file 4689 if os.path.exists(param_exomiser_preset): 4690 # Preset file is provided in full path 4691 param_exomiser_analysis_default_config_file = ( 4692 param_exomiser_preset 4693 ) 4694 # elif os.path.exists(full_path(param_exomiser_preset)): 4695 # # Preset file is provided in full path 4696 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4697 elif os.path.exists( 4698 os.path.join(folder_config, param_exomiser_preset) 4699 ): 4700 # Preset file is provided a basename in config folder (can be a path with subfolders) 4701 param_exomiser_analysis_default_config_file = os.path.join( 4702 folder_config, param_exomiser_preset 4703 ) 4704 else: 4705 # Construct preset file 4706 param_exomiser_analysis_default_config_file = os.path.join( 4707 folder_config, 4708 f"preset-{param_exomiser_preset}-analysis.json", 4709 ) 4710 4711 # If preset file exists 4712 param_exomiser_analysis_default_config_file = full_path( 4713 
param_exomiser_analysis_default_config_file 4714 ) 4715 if os.path.exists(param_exomiser_analysis_default_config_file): 4716 # Load prest file into analysis dict (either yaml or json) 4717 with open( 4718 param_exomiser_analysis_default_config_file 4719 ) as json_file: 4720 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4721 json_file 4722 ) 4723 4724 # Error preset file 4725 else: 4726 log.error( 4727 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4728 ) 4729 raise ValueError( 4730 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4731 ) 4732 4733 # If no analysis dict created 4734 if not param_exomiser_analysis_dict: 4735 log.error(f"No analysis config") 4736 raise ValueError(f"No analysis config") 4737 4738 # Log 4739 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4740 4741 ### PHENOPACKET ### 4742 ################### 4743 4744 # If no PhenoPacket in analysis dict -> check in param 4745 if "phenopacket" not in param_exomiser_analysis_dict: 4746 4747 # If PhenoPacket in param -> load anlaysis json 4748 if param_exomiser.get("phenopacket", None): 4749 4750 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4751 param_exomiser_phenopacket = full_path( 4752 param_exomiser_phenopacket 4753 ) 4754 4755 # If param phenopacket is a file and exists 4756 if isinstance( 4757 param_exomiser_phenopacket, str 4758 ) and os.path.exists(param_exomiser_phenopacket): 4759 # Load phenopacket file into analysis dict (either yaml or json) 4760 with open(param_exomiser_phenopacket) as json_file: 4761 param_exomiser_analysis_dict["phenopacket"] = ( 4762 yaml.safe_load(json_file) 4763 ) 4764 4765 # If param phenopacket is a dict 4766 elif isinstance(param_exomiser_phenopacket, dict): 4767 # Load phenopacket dict into analysis dict (either yaml or json) 4768 param_exomiser_analysis_dict["phenopacket"] = ( 4769 param_exomiser_phenopacket 4770 ) 4771 4772 # Error phenopacket type 
4773 else: 4774 log.error(f"Phenopacket type unknown. Check param file.") 4775 raise ValueError( 4776 f"Phenopacket type unknown. Check param file." 4777 ) 4778 4779 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4780 if "phenopacket" not in param_exomiser_analysis_dict: 4781 4782 # Init PhenoPacket 4783 param_exomiser_analysis_dict["phenopacket"] = { 4784 "id": "analysis", 4785 "proband": {}, 4786 } 4787 4788 ### Add subject ### 4789 4790 # If subject exists 4791 param_exomiser_subject = param_exomiser.get("subject", {}) 4792 4793 # If subject not exists -> found sample ID 4794 if not param_exomiser_subject: 4795 4796 # Found sample ID in param 4797 sample = param_exomiser.get("sample", None) 4798 4799 # Find sample ID (first sample) 4800 if not sample: 4801 sample_list = self.get_header_sample_list() 4802 if len(sample_list) > 0: 4803 sample = sample_list[0] 4804 else: 4805 log.error(f"No sample found") 4806 raise ValueError(f"No sample found") 4807 4808 # Create subject 4809 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4810 4811 # Add to dict 4812 param_exomiser_analysis_dict["phenopacket"][ 4813 "subject" 4814 ] = param_exomiser_subject 4815 4816 ### Add "phenotypicFeatures" ### 4817 4818 # If phenotypicFeatures exists 4819 param_exomiser_phenotypicfeatures = param_exomiser.get( 4820 "phenotypicFeatures", [] 4821 ) 4822 4823 # If phenotypicFeatures not exists -> Try to infer from hpo list 4824 if not param_exomiser_phenotypicfeatures: 4825 4826 # Found HPO in param 4827 param_exomiser_hpo = param_exomiser.get("hpo", []) 4828 4829 # Split HPO if list in string format separated by comma 4830 if isinstance(param_exomiser_hpo, str): 4831 param_exomiser_hpo = param_exomiser_hpo.split(",") 4832 4833 # Create HPO list 4834 for hpo in param_exomiser_hpo: 4835 hpo_clean = re.sub("[^0-9]", "", hpo) 4836 param_exomiser_phenotypicfeatures.append( 4837 { 4838 "type": { 4839 "id": f"HP:{hpo_clean}", 4840 "label": 
f"HP:{hpo_clean}", 4841 } 4842 } 4843 ) 4844 4845 # Add to dict 4846 param_exomiser_analysis_dict["phenopacket"][ 4847 "phenotypicFeatures" 4848 ] = param_exomiser_phenotypicfeatures 4849 4850 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4851 if not param_exomiser_phenotypicfeatures: 4852 for step in param_exomiser_analysis_dict.get( 4853 "analysis", {} 4854 ).get("steps", []): 4855 if "hiPhivePrioritiser" in step: 4856 param_exomiser_analysis_dict.get("analysis", {}).get( 4857 "steps", [] 4858 ).remove(step) 4859 4860 ### Add Input File ### 4861 4862 # Initial file name and htsFiles 4863 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4864 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4865 { 4866 "uri": tmp_vcf_name, 4867 "htsFormat": "VCF", 4868 "genomeAssembly": assembly, 4869 } 4870 ] 4871 4872 ### Add metaData ### 4873 4874 # If metaData not in analysis dict 4875 if "metaData" not in param_exomiser_analysis_dict: 4876 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4877 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4878 "createdBy": "howard", 4879 "phenopacketSchemaVersion": 1, 4880 } 4881 4882 ### OutputOptions ### 4883 4884 # Init output result folder 4885 output_results = os.path.join(tmp_dir, "results") 4886 4887 # If no outputOptions in analysis dict 4888 if "outputOptions" not in param_exomiser_analysis_dict: 4889 4890 # default output formats 4891 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4892 4893 # Get outputOptions in param 4894 output_options = param_exomiser.get("outputOptions", None) 4895 4896 # If no output_options in param -> check 4897 if not output_options: 4898 output_options = { 4899 "outputContributingVariantsOnly": False, 4900 "numGenes": 0, 4901 "outputFormats": defaut_output_formats, 4902 } 4903 4904 # Replace outputDirectory in output options 4905 output_options["outputDirectory"] = output_results 4906 output_options["outputFileName"] = "howard" 4907 4908 # 
Add outputOptions in analysis dict 4909 param_exomiser_analysis_dict["outputOptions"] = output_options 4910 4911 else: 4912 4913 # Replace output_results and output format (if exists in param) 4914 param_exomiser_analysis_dict["outputOptions"][ 4915 "outputDirectory" 4916 ] = output_results 4917 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4918 list( 4919 set( 4920 param_exomiser_analysis_dict.get( 4921 "outputOptions", {} 4922 ).get("outputFormats", []) 4923 + ["TSV_VARIANT", "VCF"] 4924 ) 4925 ) 4926 ) 4927 4928 # log 4929 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4930 4931 ### ANALYSIS FILE ### 4932 ##################### 4933 4934 ### Full JSON analysis config file ### 4935 4936 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4937 with open(exomiser_analysis, "w") as fp: 4938 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4939 4940 ### SPLIT analysis and sample config files 4941 4942 # Splitted analysis dict 4943 param_exomiser_analysis_dict_for_split = ( 4944 param_exomiser_analysis_dict.copy() 4945 ) 4946 4947 # Phenopacket JSON file 4948 exomiser_analysis_phenopacket = os.path.join( 4949 tmp_dir, "analysis_phenopacket.json" 4950 ) 4951 with open(exomiser_analysis_phenopacket, "w") as fp: 4952 json.dump( 4953 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4954 fp, 4955 indent=4, 4956 ) 4957 4958 # Analysis JSON file without Phenopacket parameters 4959 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4960 exomiser_analysis_analysis = os.path.join( 4961 tmp_dir, "analysis_analysis.json" 4962 ) 4963 with open(exomiser_analysis_analysis, "w") as fp: 4964 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4965 4966 ### INITAL VCF file ### 4967 ####################### 4968 4969 ### Create list of samples to use and include inti initial VCF file #### 4970 4971 # Subject (main sample) 4972 # Get sample ID in analysis dict 4973 sample_subject = ( 4974 
param_exomiser_analysis_dict.get("phenopacket", {}) 4975 .get("subject", {}) 4976 .get("id", None) 4977 ) 4978 sample_proband = ( 4979 param_exomiser_analysis_dict.get("phenopacket", {}) 4980 .get("proband", {}) 4981 .get("subject", {}) 4982 .get("id", None) 4983 ) 4984 sample = [] 4985 if sample_subject: 4986 sample.append(sample_subject) 4987 if sample_proband: 4988 sample.append(sample_proband) 4989 4990 # Get sample ID within Pedigree 4991 pedigree_persons_list = ( 4992 param_exomiser_analysis_dict.get("phenopacket", {}) 4993 .get("pedigree", {}) 4994 .get("persons", {}) 4995 ) 4996 4997 # Create list with all sample ID in pedigree (if exists) 4998 pedigree_persons = [] 4999 for person in pedigree_persons_list: 5000 pedigree_persons.append(person.get("individualId")) 5001 5002 # Concat subject sample ID and samples ID in pedigreesamples 5003 samples = list(set(sample + pedigree_persons)) 5004 5005 # Check if sample list is not empty 5006 if not samples: 5007 log.error(f"No samples found") 5008 raise ValueError(f"No samples found") 5009 5010 # Create VCF with sample (either sample in param or first one by default) 5011 # Export VCF file 5012 self.export_variant_vcf( 5013 vcf_file=tmp_vcf_name, 5014 remove_info=True, 5015 add_samples=True, 5016 list_samples=samples, 5017 index=False, 5018 ) 5019 5020 ### Execute Exomiser ### 5021 ######################## 5022 5023 # Init command 5024 exomiser_command = "" 5025 5026 # Command exomiser options 5027 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5028 5029 # Release 5030 exomiser_release = param_exomiser.get("release", None) 5031 if exomiser_release: 5032 # phenotype data version 5033 exomiser_options += ( 5034 f" --exomiser.phenotype.data-version={exomiser_release} " 5035 ) 5036 # data version 5037 exomiser_options += ( 5038 f" --exomiser.{assembly}.data-version={exomiser_release} " 5039 ) 5040 # variant 
white list 5041 variant_white_list_file = ( 5042 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5043 ) 5044 if os.path.exists( 5045 os.path.join( 5046 databases_folders, assembly, variant_white_list_file 5047 ) 5048 ): 5049 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5050 5051 # transcript_source 5052 transcript_source = param_exomiser.get( 5053 "transcript_source", None 5054 ) # ucsc, refseq, ensembl 5055 if transcript_source: 5056 exomiser_options += ( 5057 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5058 ) 5059 5060 # If analysis contain proband param 5061 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5062 "proband", {} 5063 ): 5064 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5065 5066 # If no proband (usually uniq sample) 5067 else: 5068 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5069 5070 # Log 5071 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5072 5073 # Run command 5074 result = subprocess.call( 5075 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5076 ) 5077 if result: 5078 log.error("Exomiser command failed") 5079 raise ValueError("Exomiser command failed") 5080 5081 ### RESULTS ### 5082 ############### 5083 5084 ### Annotate with TSV fields ### 5085 5086 # Init result tsv file 5087 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5088 5089 # Init result tsv file 5090 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5091 5092 # Parse TSV file and explode columns in INFO field 5093 if exomiser_to_info and os.path.exists(output_results_tsv): 5094 5095 # Log 5096 log.debug("Exomiser columns to VCF INFO field") 5097 5098 # Retrieve columns and types 5099 query = f""" SELECT * FROM read_csv('{output_results_tsv}', 
auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5100 output_results_tsv_df = self.get_query_to_df(query) 5101 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5102 5103 # Init concat fields for update 5104 sql_query_update_concat_fields = [] 5105 5106 # Fields to avoid 5107 fields_to_avoid = [ 5108 "CONTIG", 5109 "START", 5110 "END", 5111 "REF", 5112 "ALT", 5113 "QUAL", 5114 "FILTER", 5115 "GENOTYPE", 5116 ] 5117 5118 # List all columns to add into header 5119 for header_column in output_results_tsv_columns: 5120 5121 # If header column is enable 5122 if header_column not in fields_to_avoid: 5123 5124 # Header info type 5125 header_info_type = "String" 5126 header_column_df = output_results_tsv_df[header_column] 5127 header_column_df_dtype = header_column_df.dtype 5128 if header_column_df_dtype == object: 5129 if ( 5130 pd.to_numeric(header_column_df, errors="coerce") 5131 .notnull() 5132 .all() 5133 ): 5134 header_info_type = "Float" 5135 else: 5136 header_info_type = "Integer" 5137 5138 # Header info 5139 characters_to_validate = ["-"] 5140 pattern = "[" + "".join(characters_to_validate) + "]" 5141 header_info_name = re.sub( 5142 pattern, 5143 "_", 5144 f"Exomiser_{header_column}".replace("#", ""), 5145 ) 5146 header_info_number = "." 
5147 header_info_description = ( 5148 f"Exomiser {header_column} annotation" 5149 ) 5150 header_info_source = "Exomiser" 5151 header_info_version = "unknown" 5152 header_info_code = CODE_TYPE_MAP[header_info_type] 5153 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5154 header_info_name, 5155 header_info_number, 5156 header_info_type, 5157 header_info_description, 5158 header_info_source, 5159 header_info_version, 5160 header_info_code, 5161 ) 5162 5163 # Add field to add for update to concat fields 5164 sql_query_update_concat_fields.append( 5165 f""" 5166 CASE 5167 WHEN table_parquet."{header_column}" NOT IN ('','.') 5168 THEN concat( 5169 '{header_info_name}=', 5170 table_parquet."{header_column}", 5171 ';' 5172 ) 5173 5174 ELSE '' 5175 END 5176 """ 5177 ) 5178 5179 # Update query 5180 sql_query_update = f""" 5181 UPDATE {table_variants} as table_variants 5182 SET INFO = concat( 5183 CASE 5184 WHEN INFO NOT IN ('', '.') 5185 THEN INFO 5186 ELSE '' 5187 END, 5188 CASE 5189 WHEN table_variants.INFO NOT IN ('','.') 5190 THEN ';' 5191 ELSE '' 5192 END, 5193 ( 5194 SELECT 5195 concat( 5196 {",".join(sql_query_update_concat_fields)} 5197 ) 5198 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5199 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5200 AND table_parquet.\"START\" = table_variants.\"POS\" 5201 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5202 AND table_parquet.\"REF\" = table_variants.\"REF\" 5203 ) 5204 ) 5205 ; 5206 """ 5207 5208 # Update 5209 self.conn.execute(sql_query_update) 5210 5211 ### Annotate with VCF INFO field ### 5212 5213 # Init result VCF file 5214 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5215 5216 # If VCF exists 5217 if os.path.exists(output_results_vcf): 5218 5219 # Log 5220 log.debug("Exomiser result VCF update variants") 5221 5222 # Find Exomiser INFO field annotation in header 5223 with 
gzip.open(output_results_vcf, "rt") as f: 5224 header_list = self.read_vcf_header(f) 5225 exomiser_vcf_header = vcf.Reader( 5226 io.StringIO("\n".join(header_list)) 5227 ) 5228 5229 # Add annotation INFO field to header 5230 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5231 5232 # Update variants with VCF 5233 self.update_from_vcf(output_results_vcf) 5234 5235 return True 5236 5237 def annotation_snpeff(self, threads: int = None) -> None: 5238 """ 5239 This function annotate with snpEff 5240 5241 :param threads: The number of threads to use 5242 :return: the value of the variable "return_value". 5243 """ 5244 5245 # DEBUG 5246 log.debug("Start annotation with snpeff databases") 5247 5248 # Threads 5249 if not threads: 5250 threads = self.get_threads() 5251 log.debug("Threads: " + str(threads)) 5252 5253 # DEBUG 5254 delete_tmp = True 5255 if self.get_config().get("verbosity", "warning") in ["debug"]: 5256 delete_tmp = False 5257 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5258 5259 # Config 5260 config = self.get_config() 5261 log.debug("Config: " + str(config)) 5262 5263 # Config - Folders - Databases 5264 databases_folders = ( 5265 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5266 ) 5267 log.debug("Databases annotations: " + str(databases_folders)) 5268 5269 # Config - snpEff bin command 5270 snpeff_bin_command = get_bin_command( 5271 bin="snpEff.jar", 5272 tool="snpeff", 5273 bin_type="jar", 5274 config=config, 5275 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5276 ) 5277 if not snpeff_bin_command: 5278 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5279 log.error(msg_err) 5280 raise ValueError(msg_err) 5281 5282 # Config - snpEff databases 5283 snpeff_databases = ( 5284 config.get("folders", {}) 5285 .get("databases", {}) 5286 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5287 ) 5288 snpeff_databases = full_path(snpeff_databases) 5289 if snpeff_databases is not None and 
snpeff_databases != "": 5290 log.debug(f"Create snpEff databases folder") 5291 if not os.path.exists(snpeff_databases): 5292 os.makedirs(snpeff_databases) 5293 5294 # Param 5295 param = self.get_param() 5296 log.debug("Param: " + str(param)) 5297 5298 # Param 5299 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5300 log.debug("Options: " + str(options)) 5301 5302 # Param - Assembly 5303 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5304 5305 # Param - Options 5306 snpeff_options = ( 5307 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5308 ) 5309 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5310 snpeff_csvstats = ( 5311 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5312 ) 5313 if snpeff_stats: 5314 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5315 snpeff_stats = full_path(snpeff_stats) 5316 snpeff_options += f" -stats {snpeff_stats}" 5317 if snpeff_csvstats: 5318 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5319 snpeff_csvstats = full_path(snpeff_csvstats) 5320 snpeff_options += f" -csvStats {snpeff_csvstats}" 5321 5322 # Data 5323 table_variants = self.get_table_variants() 5324 5325 # Check if not empty 5326 log.debug("Check if not empty") 5327 sql_query_chromosomes = ( 5328 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5329 ) 5330 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5331 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5332 log.info(f"VCF empty") 5333 return 5334 5335 # Export in VCF 5336 log.debug("Create initial file to annotate") 5337 tmp_vcf = NamedTemporaryFile( 5338 prefix=self.get_prefix(), 5339 dir=self.get_tmp_dir(), 5340 suffix=".vcf.gz", 5341 delete=True, 5342 ) 5343 tmp_vcf_name = tmp_vcf.name 5344 5345 # VCF header 5346 vcf_reader = self.get_header() 5347 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5348 5349 # Existing annotations 5350 for vcf_annotation in self.get_header().infos: 5351 5352 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5353 log.debug( 5354 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5355 ) 5356 5357 # Memory limit 5358 # if config.get("memory", None): 5359 # memory_limit = config.get("memory", "8G") 5360 # else: 5361 # memory_limit = "8G" 5362 memory_limit = self.get_memory("8G") 5363 log.debug(f"memory_limit: {memory_limit}") 5364 5365 # snpEff java options 5366 snpeff_java_options = ( 5367 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5368 ) 5369 log.debug(f"Exomiser java options: {snpeff_java_options}") 5370 5371 force_update_annotation = True 5372 5373 if "ANN" not in self.get_header().infos or force_update_annotation: 5374 5375 # Check snpEff database 5376 log.debug(f"Check snpEff databases {[assembly]}") 5377 databases_download_snpeff( 5378 folder=snpeff_databases, assemblies=[assembly], config=config 5379 ) 5380 5381 # Export VCF file 5382 self.export_variant_vcf( 5383 vcf_file=tmp_vcf_name, 5384 remove_info=True, 5385 add_samples=False, 5386 index=True, 5387 ) 5388 5389 # Tmp file 5390 err_files = [] 5391 tmp_annotate_vcf = NamedTemporaryFile( 5392 prefix=self.get_prefix(), 5393 dir=self.get_tmp_dir(), 5394 suffix=".vcf", 5395 delete=False, 5396 ) 5397 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5398 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5399 err_files.append(tmp_annotate_vcf_name_err) 5400 5401 # Command 5402 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5403 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5404 run_parallel_commands([snpeff_command], 1) 5405 5406 # Error messages 5407 log.info(f"Error/Warning messages:") 5408 error_message_command_all = [] 5409 
error_message_command_warning = [] 5410 error_message_command_err = [] 5411 for err_file in err_files: 5412 with open(err_file, "r") as f: 5413 for line in f: 5414 message = line.strip() 5415 error_message_command_all.append(message) 5416 if line.startswith("[W::"): 5417 error_message_command_warning.append(message) 5418 if line.startswith("[E::"): 5419 error_message_command_err.append(f"{err_file}: " + message) 5420 # log info 5421 for message in list( 5422 set(error_message_command_err + error_message_command_warning) 5423 ): 5424 log.info(f" {message}") 5425 # debug info 5426 for message in list(set(error_message_command_all)): 5427 log.debug(f" {message}") 5428 # failed 5429 if len(error_message_command_err): 5430 log.error("Annotation failed: Error in commands") 5431 raise ValueError("Annotation failed: Error in commands") 5432 5433 # Find annotation in header 5434 with open(tmp_annotate_vcf_name, "rt") as f: 5435 header_list = self.read_vcf_header(f) 5436 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5437 5438 for ann in annovar_vcf_header.infos: 5439 if ann not in self.get_header().infos: 5440 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5441 5442 # Update variants 5443 log.info(f"Annotation - Updating...") 5444 self.update_from_vcf(tmp_annotate_vcf_name) 5445 5446 else: 5447 if "ANN" in self.get_header().infos: 5448 log.debug(f"Existing snpEff annotations in VCF") 5449 if force_update_annotation: 5450 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5451 5452 def annotation_annovar(self, threads: int = None) -> None: 5453 """ 5454 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5455 annotations 5456 5457 :param threads: number of threads to use 5458 :return: the value of the variable "return_value". 
5459 """ 5460 5461 # DEBUG 5462 log.debug("Start annotation with Annovar databases") 5463 5464 # Threads 5465 if not threads: 5466 threads = self.get_threads() 5467 log.debug("Threads: " + str(threads)) 5468 5469 # Tmp en Err files 5470 tmp_files = [] 5471 err_files = [] 5472 5473 # DEBUG 5474 delete_tmp = True 5475 if self.get_config().get("verbosity", "warning") in ["debug"]: 5476 delete_tmp = False 5477 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5478 5479 # Config 5480 config = self.get_config() 5481 log.debug("Config: " + str(config)) 5482 5483 # Config - Folders - Databases 5484 databases_folders = ( 5485 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 5486 ) 5487 log.debug("Databases annotations: " + str(databases_folders)) 5488 5489 # Config - annovar bin command 5490 annovar_bin_command = get_bin_command( 5491 bin="table_annovar.pl", 5492 tool="annovar", 5493 bin_type="perl", 5494 config=config, 5495 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 5496 ) 5497 if not annovar_bin_command: 5498 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 5499 log.error(msg_err) 5500 raise ValueError(msg_err) 5501 5502 # Config - BCFTools bin command 5503 bcftools_bin_command = get_bin_command( 5504 bin="bcftools", 5505 tool="bcftools", 5506 bin_type="bin", 5507 config=config, 5508 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5509 ) 5510 if not bcftools_bin_command: 5511 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5512 log.error(msg_err) 5513 raise ValueError(msg_err) 5514 5515 # Config - annovar databases 5516 annovar_databases = ( 5517 config.get("folders", {}) 5518 .get("databases", {}) 5519 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5520 ) 5521 if annovar_databases is not None: 5522 if isinstance(annovar_databases, list): 5523 annovar_databases = full_path(annovar_databases[0]) 5524 log.warning(f"Annovar databases folder '{annovar_databases}' selected") 5525 annovar_databases = 
full_path(annovar_databases) 5526 if not os.path.exists(annovar_databases): 5527 log.info(f"Annovar databases folder '{annovar_databases}' created") 5528 Path(annovar_databases).mkdir(parents=True, exist_ok=True) 5529 else: 5530 msg_err = f"Annovar databases configuration failed" 5531 log.error(msg_err) 5532 raise ValueError(msg_err) 5533 5534 # Param 5535 param = self.get_param() 5536 log.debug("Param: " + str(param)) 5537 5538 # Param - options 5539 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5540 log.debug("Options: " + str(options)) 5541 5542 # Param - annotations 5543 annotations = ( 5544 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5545 ) 5546 log.debug("Annotations: " + str(annotations)) 5547 5548 # Param - Assembly 5549 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5550 5551 # Annovar database assembly 5552 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5553 if annovar_databases_assembly != "" and not os.path.exists( 5554 annovar_databases_assembly 5555 ): 5556 os.makedirs(annovar_databases_assembly) 5557 5558 # Data 5559 table_variants = self.get_table_variants() 5560 5561 # Check if not empty 5562 log.debug("Check if not empty") 5563 sql_query_chromosomes = ( 5564 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5565 ) 5566 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5567 if not sql_query_chromosomes_df["count"][0]: 5568 log.info(f"VCF empty") 5569 return 5570 5571 # VCF header 5572 vcf_reader = self.get_header() 5573 log.debug("Initial header: " + str(vcf_reader.infos)) 5574 5575 # Existing annotations 5576 for vcf_annotation in self.get_header().infos: 5577 5578 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5579 log.debug( 5580 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5581 ) 5582 5583 force_update_annotation = True 5584 5585 if annotations: 5586 5587 
commands = [] 5588 tmp_annotates_vcf_name_list = [] 5589 5590 # Export in VCF 5591 log.debug("Create initial file to annotate") 5592 tmp_vcf = NamedTemporaryFile( 5593 prefix=self.get_prefix(), 5594 dir=self.get_tmp_dir(), 5595 suffix=".vcf.gz", 5596 delete=False, 5597 ) 5598 tmp_vcf_name = tmp_vcf.name 5599 tmp_files.append(tmp_vcf_name) 5600 tmp_files.append(tmp_vcf_name + ".tbi") 5601 5602 # Export VCF file 5603 self.export_variant_vcf( 5604 vcf_file=tmp_vcf_name, 5605 remove_info=".", 5606 add_samples=False, 5607 index=True, 5608 ) 5609 5610 # Create file for field rename 5611 log.debug("Create file for field rename") 5612 tmp_rename = NamedTemporaryFile( 5613 prefix=self.get_prefix(), 5614 dir=self.get_tmp_dir(), 5615 suffix=".rename", 5616 delete=False, 5617 ) 5618 tmp_rename_name = tmp_rename.name 5619 tmp_files.append(tmp_rename_name) 5620 5621 # Check Annovar database 5622 log.debug( 5623 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5624 ) 5625 databases_download_annovar( 5626 folder=annovar_databases, 5627 files=list(annotations.keys()), 5628 assemblies=[assembly], 5629 ) 5630 5631 for annotation in annotations: 5632 annotation_fields = annotations[annotation] 5633 5634 if not annotation_fields: 5635 annotation_fields = {"INFO": None} 5636 5637 log.info(f"Annotations Annovar - database '{annotation}'") 5638 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5639 5640 # Tmp file for annovar 5641 err_files = [] 5642 tmp_annotate_vcf_directory = TemporaryDirectory( 5643 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5644 ) 5645 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5646 tmp_annotate_vcf_name_annovar = ( 5647 tmp_annotate_vcf_prefix + "." 
+ assembly + "_multianno.vcf" 5648 ) 5649 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5650 err_files.append(tmp_annotate_vcf_name_err) 5651 tmp_files.append(tmp_annotate_vcf_name_err) 5652 5653 # Tmp file final vcf annotated by annovar 5654 tmp_annotate_vcf = NamedTemporaryFile( 5655 prefix=self.get_prefix(), 5656 dir=self.get_tmp_dir(), 5657 suffix=".vcf.gz", 5658 delete=False, 5659 ) 5660 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5661 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5662 tmp_files.append(tmp_annotate_vcf_name) 5663 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5664 5665 # Number of fields 5666 annotation_list = [] 5667 annotation_renamed_list = [] 5668 5669 for annotation_field in annotation_fields: 5670 5671 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5672 annotation_fields_new_name = annotation_fields.get( 5673 annotation_field, annotation_field 5674 ) 5675 if not annotation_fields_new_name: 5676 annotation_fields_new_name = annotation_field 5677 5678 if ( 5679 force_update_annotation 5680 or annotation_fields_new_name not in self.get_header().infos 5681 ): 5682 annotation_list.append(annotation_field) 5683 annotation_renamed_list.append(annotation_fields_new_name) 5684 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5685 log.warning( 5686 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5687 ) 5688 5689 # Add rename info 5690 run_parallel_commands( 5691 [ 5692 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5693 ], 5694 1, 5695 ) 5696 5697 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5698 log.debug("annotation_list: " + str(annotation_list)) 5699 5700 # protocol 5701 protocol = annotation 5702 5703 # argument 5704 argument = "" 5705 5706 # operation 5707 operation = "f" 5708 if annotation in ["refGene", "refGeneWithVer"] or 
annotation.startswith( 5709 "ensGene" 5710 ): 5711 operation = "g" 5712 if options.get("genebase", None): 5713 argument = f"""'{options.get("genebase","")}'""" 5714 elif annotation in ["cytoBand"]: 5715 operation = "r" 5716 5717 # argument option 5718 argument_option = "" 5719 if argument != "": 5720 argument_option = " --argument " + argument 5721 5722 # command options 5723 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5724 for option in options: 5725 if option not in ["genebase"]: 5726 command_options += f""" --{option}={options[option]}""" 5727 5728 # Command 5729 5730 # Command - Annovar 5731 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5732 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5733 5734 # Command - start pipe 5735 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5736 5737 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5738 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5739 5740 # Command - Special characters (refGene annotation) 5741 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5742 5743 # Command - Clean empty fields (with value ".") 5744 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5745 5746 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5747 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5748 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5749 # for ann in annotation_renamed_list: 5750 for ann in annotation_list: 5751 annovar_fields_to_keep.append(f"^INFO/{ann}") 5752 5753 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5754 5755 # Command - indexing 5756 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5757 5758 log.debug(f"Annotation - Annovar command: {command_annovar}") 5759 run_parallel_commands([command_annovar], 1) 5760 5761 # Error messages 5762 log.info(f"Error/Warning messages:") 5763 error_message_command_all = [] 5764 error_message_command_warning = [] 5765 error_message_command_err = [] 5766 for err_file in err_files: 5767 with open(err_file, "r") as f: 5768 for line in f: 5769 message = line.strip() 5770 error_message_command_all.append(message) 5771 if line.startswith("[W::") or line.startswith("WARNING"): 5772 error_message_command_warning.append(message) 5773 if line.startswith("[E::") or line.startswith("ERROR"): 5774 
error_message_command_err.append( 5775 f"{err_file}: " + message 5776 ) 5777 # log info 5778 for message in list( 5779 set(error_message_command_err + error_message_command_warning) 5780 ): 5781 log.info(f" {message}") 5782 # debug info 5783 for message in list(set(error_message_command_all)): 5784 log.debug(f" {message}") 5785 # failed 5786 if len(error_message_command_err): 5787 log.error("Annotation failed: Error in commands") 5788 raise ValueError("Annotation failed: Error in commands") 5789 5790 if tmp_annotates_vcf_name_list: 5791 5792 # List of annotated files 5793 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5794 5795 # Tmp file 5796 tmp_annotate_vcf = NamedTemporaryFile( 5797 prefix=self.get_prefix(), 5798 dir=self.get_tmp_dir(), 5799 suffix=".vcf.gz", 5800 delete=False, 5801 ) 5802 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5803 tmp_files.append(tmp_annotate_vcf_name) 5804 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5805 err_files.append(tmp_annotate_vcf_name_err) 5806 tmp_files.append(tmp_annotate_vcf_name_err) 5807 5808 # Command merge 5809 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5810 log.info( 5811 f"Annotation Annovar - Annotation merging " 5812 + str(len(tmp_annotates_vcf_name_list)) 5813 + " annotated files" 5814 ) 5815 log.debug(f"Annotation - merge command: {merge_command}") 5816 run_parallel_commands([merge_command], 1) 5817 5818 # Find annotation in header 5819 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5820 header_list = self.read_vcf_header(f) 5821 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5822 5823 for ann in annovar_vcf_header.infos: 5824 if ann not in self.get_header().infos: 5825 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5826 5827 # Update variants 5828 log.info(f"Annotation Annovar - 
    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or several parquet (or attachable)
        annotation databases.

        For each database listed in param["annotation"]["parquet"]["annotations"],
        per-chromosome SQL UPDATE queries are built that append 'FIELD=value'
        pairs to the INFO column of the variants table. The special annotation
        key "ALL" expands to every database found by `scan_databases`, and the
        special field keys "ALL"/"INFO" expand to every field declared in the
        database header. The in-memory VCF header is updated accordingly.

        :param threads: number of threads to use for the annotation
            (defaults to self.get_threads())
        :return: None (the method updates the variants table in place)
        :raises ValueError: if an annotation database file or its header file
            cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: search both the generic "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: mapping of database -> {field: new_name}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: overwrite existing INFO fields
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Append only where the existing value is empty ('' or '.')
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated in place as new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (dropped again at the end of the method)
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" expands to every database found by scan_databases
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a meta-entry, already expanded above
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields (default: copy the whole INFO column)
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion (ATTACH for non-parquet formats)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (columns beyond the standard VCF ones)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a generic String INFO declaration
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (regions databases only)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (so the fresh value replaces it)
                                query = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = REGEXP_REPLACE(
                                            concat(table_variants.INFO,''),
                                            ';*{annotation_fields_new_name}=[^;]*',
                                            ''
                                        )
                                WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to generic metadata)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only fill values currently empty ('' or '.')
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Copy the whole database INFO column in one shot instead
                        # of per-field CASE expressions
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Removal queries (update option) run before the annotation queries
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database:
                            # join on positional overlap and aggregate values per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact variant match
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append new fields to INFO,
                            # inserting ';' only when both sides are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = 
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.') 
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (the generated concat() can nest very deeply)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # NOTE(review): assumes DuckDB exposes the affected-row
                            # count of an UPDATE in a "Count" column — confirm
                            # across DuckDB versions
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
6433 """ 6434 6435 # DEBUG 6436 log.debug("Start annotation with splice tools") 6437 6438 # Threads 6439 if not threads: 6440 threads = self.get_threads() 6441 log.debug("Threads: " + str(threads)) 6442 6443 # DEBUG 6444 delete_tmp = True 6445 if self.get_config().get("verbosity", "warning") in ["debug"]: 6446 delete_tmp = False 6447 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6448 6449 # Config 6450 config = self.get_config() 6451 log.debug("Config: " + str(config)) 6452 splice_config = config.get("tools", {}).get("splice", {}) 6453 if not splice_config: 6454 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6455 msg_err = "No Splice tool config" 6456 raise ValueError(msg_err) 6457 log.debug(f"splice_config: {splice_config}") 6458 6459 # Config - Folders - Databases 6460 databases_folders = ( 6461 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6462 ) 6463 log.debug("Databases annotations: " + str(databases_folders)) 6464 6465 # Splice docker image 6466 splice_docker_image = splice_config.get("docker").get("image") 6467 6468 # Pull splice image if it's not already there 6469 if not check_docker_image_exists(splice_docker_image): 6470 log.warning( 6471 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6472 ) 6473 try: 6474 command(f"docker pull {splice_config.get('docker').get('image')}") 6475 except subprocess.CalledProcessError: 6476 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6477 log.error(msg_err) 6478 raise ValueError(msg_err) 6479 6480 # Config - splice databases 6481 splice_databases = ( 6482 config.get("folders", {}) 6483 .get("databases", {}) 6484 .get("splice", DEFAULT_SPLICE_FOLDER) 6485 ) 6486 splice_databases = full_path(splice_databases) 6487 6488 # Param 6489 param = self.get_param() 6490 log.debug("Param: " + str(param)) 6491 6492 # Param 6493 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6494 
log.debug("Options: " + str(options)) 6495 6496 # Data 6497 table_variants = self.get_table_variants() 6498 6499 # Check if not empty 6500 log.debug("Check if not empty") 6501 sql_query_chromosomes = ( 6502 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6503 ) 6504 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6505 log.info("VCF empty") 6506 return None 6507 6508 # Export in VCF 6509 log.debug("Create initial file to annotate") 6510 6511 # Create output folder / work folder 6512 if options.get("output_folder", ""): 6513 output_folder = options.get("output_folder", "") 6514 if not os.path.exists(output_folder): 6515 Path(output_folder).mkdir(parents=True, exist_ok=True) 6516 else: 6517 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6518 if not os.path.exists(output_folder): 6519 Path(output_folder).mkdir(parents=True, exist_ok=True) 6520 6521 if options.get("workdir", ""): 6522 workdir = options.get("workdir", "") 6523 else: 6524 workdir = "/work" 6525 6526 # Create tmp VCF file 6527 tmp_vcf = NamedTemporaryFile( 6528 prefix=self.get_prefix(), 6529 dir=output_folder, 6530 suffix=".vcf", 6531 delete=False, 6532 ) 6533 tmp_vcf_name = tmp_vcf.name 6534 6535 # VCF header 6536 header = self.get_header() 6537 6538 # Existing annotations 6539 for vcf_annotation in self.get_header().infos: 6540 6541 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6542 log.debug( 6543 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6544 ) 6545 6546 # Memory limit 6547 if config.get("memory", None): 6548 memory_limit = config.get("memory", "8G").upper() 6549 # upper() 6550 else: 6551 memory_limit = "8G" 6552 log.debug(f"memory_limit: {memory_limit}") 6553 6554 # Check number of variants to annotate 6555 where_clause_regex_spliceai = r"SpliceAI_\w+" 6556 where_clause_regex_spip = r"SPiP_\w+" 6557 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6558 df_list_of_variants_to_annotate = self.get_query_to_df( 6559 query=f""" SELECT * FROM variants {where_clause} """ 6560 ) 6561 if len(df_list_of_variants_to_annotate) == 0: 6562 log.warning( 6563 f"No variants to annotate with splice. Variants probably already annotated with splice" 6564 ) 6565 return None 6566 else: 6567 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6568 6569 # Export VCF file 6570 self.export_variant_vcf( 6571 vcf_file=tmp_vcf_name, 6572 remove_info=True, 6573 add_samples=True, 6574 index=False, 6575 where_clause=where_clause, 6576 ) 6577 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6578 if any(value for value in splice_config.values() if value is None): 6579 log.warning("At least one splice config parameter is empty") 6580 # exit annotation_splice 6581 return None 6582 6583 # Params in splice nf 6584 def check_values(dico: dict): 6585 """ 6586 Ensure parameters for NF splice pipeline 6587 """ 6588 for key, val in dico.items(): 6589 if key == "genome": 6590 if any( 6591 assemb in options.get("genome", {}) 6592 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6593 ): 6594 yield f"--{key} hg19" 6595 elif any( 6596 assemb in options.get("genome", {}) 6597 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6598 ): 6599 yield f"--{key} hg38" 6600 elif ( 6601 (isinstance(val, str) and val) 6602 or isinstance(val, int) 6603 or isinstance(val, bool) 6604 ): 6605 yield f"--{key} {val}" 6606 6607 # Genome 6608 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6609 options["genome"] = genome 6610 # NF params 6611 nf_params = [] 6612 # Add options 6613 if options: 6614 log.debug(options) 6615 nf_params = list(check_values(options)) 6616 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6617 else: 6618 log.debug("No NF params provided") 6619 # Add threads 6620 if "threads" not in 
options.keys(): 6621 nf_params.append(f"--threads {threads}") 6622 # Genome path 6623 genome_path = find_genome( 6624 config.get("folders", {}) 6625 .get("databases", {}) 6626 .get("genomes", DEFAULT_GENOME_FOLDER), 6627 file=f"{genome}.fa", 6628 ) 6629 # Add genome path 6630 if not genome_path: 6631 raise ValueError( 6632 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6633 ) 6634 else: 6635 log.debug(f"Genome: {genome_path}") 6636 nf_params.append(f"--genome_path {genome_path}") 6637 6638 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6639 """ 6640 Setting up updated databases for SPiP and SpliceAI 6641 """ 6642 6643 try: 6644 6645 # SpliceAI assembly transcriptome 6646 spliceai_assembly = os.path.join( 6647 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6648 options.get("genome"), 6649 "transcriptome", 6650 ) 6651 spip_assembly = options.get("genome") 6652 6653 spip = find( 6654 f"transcriptome_{spip_assembly}.RData", 6655 config.get("folders", {}).get("databases", {}).get("spip", {}), 6656 ) 6657 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6658 log.debug(f"SPiP annotations: {spip}") 6659 log.debug(f"SpliceAI annotations: {spliceai}") 6660 if spip and spliceai: 6661 return [ 6662 f"--spip_transcriptome {spip}", 6663 f"--spliceai_transcriptome {spliceai}", 6664 ] 6665 else: 6666 log.warning( 6667 "Can't find splice databases in configuration, use annotations file from image" 6668 ) 6669 except TypeError: 6670 log.warning( 6671 "Can't find splice databases in configuration, use annotations file from image" 6672 ) 6673 return [] 6674 6675 # Add options, check if transcriptome option have already beend provided 6676 if ( 6677 "spip_transcriptome" not in nf_params 6678 and "spliceai_transcriptome" not in nf_params 6679 ): 6680 splice_reference = splice_annotations(options, config) 6681 if splice_reference: 6682 
nf_params.extend(splice_reference) 6683 # nf_params.append(f"--output_folder {output_folder}") 6684 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6685 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6686 log.debug(cmd) 6687 splice_config["docker"]["command"] = cmd 6688 6689 # Ensure proxy is set 6690 proxy = [ 6691 f"-e {var}={os.getenv(var)}" 6692 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6693 if os.getenv(var) is not None 6694 ] 6695 docker_cmd = get_bin_command( 6696 tool="splice", 6697 bin_type="docker", 6698 config=config, 6699 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6700 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6701 ) 6702 # print(docker_cmd) 6703 # exit() 6704 # Docker debug 6705 # if splice_config.get("rm_container"): 6706 # rm_container = "--rm" 6707 # else: 6708 # rm_container = "" 6709 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6710 log.debug(docker_cmd) 6711 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6712 log.debug(res.stdout) 6713 if res.stderr: 6714 log.error(res.stderr) 6715 res.check_returncode() 6716 # Update variants 6717 log.info("Annotation - Updating...") 6718 # Test find output vcf 6719 log.debug( 6720 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6721 ) 6722 output_vcf = [] 6723 # Wrong folder to look in 6724 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6725 if ( 6726 files 6727 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6728 ): 6729 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6730 # log.debug(os.listdir(options.get("output_folder"))) 6731 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6732 if not output_vcf: 6733 log.debug( 6734 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6735 ) 6736 else: 6737 # Get new header from annotated vcf 6738 log.debug(f"Initial header: {len(header.infos)} fields") 6739 # Create new header with splice infos 6740 new_vcf = Variants(input=output_vcf[0]) 6741 new_vcf_header = new_vcf.get_header().infos 6742 for keys, infos in new_vcf_header.items(): 6743 if keys not in header.infos.keys(): 6744 header.infos[keys] = infos 6745 log.debug(f"New header: {len(header.infos)} fields") 6746 log.debug(f"Splice tmp output: {output_vcf[0]}") 6747 self.update_from_vcf(output_vcf[0]) 6748 6749 # Remove file 6750 remove_if_exists(output_vcf) 6751 6752 ### 6753 # Prioritization 6754 ### 6755 6756 def get_config_default(self, name: str) -> dict: 6757 """ 6758 The function `get_config_default` returns a dictionary containing default configurations for 6759 various calculations and prioritizations. 6760 6761 :param name: The `get_config_default` function returns a dictionary containing default 6762 configurations for different calculations and prioritizations. The `name` parameter is used to 6763 specify which specific configuration to retrieve from the dictionary 6764 :type name: str 6765 :return: The function `get_config_default` returns a dictionary containing default configuration 6766 settings for different calculations and prioritizations. The specific configuration settings are 6767 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6768 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6769 returned. If there is no match, an empty dictionary is returned. 
6770 """ 6771 6772 config_default = { 6773 "calculations": { 6774 "variant_chr_pos_alt_ref": { 6775 "type": "sql", 6776 "name": "variant_chr_pos_alt_ref", 6777 "description": "Create a variant ID with chromosome, position, alt and ref", 6778 "available": False, 6779 "output_column_name": "variant_chr_pos_alt_ref", 6780 "output_column_type": "String", 6781 "output_column_description": "variant ID with chromosome, position, alt and ref", 6782 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6783 "operation_info": True, 6784 }, 6785 "VARTYPE": { 6786 "type": "sql", 6787 "name": "VARTYPE", 6788 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6789 "available": True, 6790 "table": "variants", 6791 "output_column_name": "VARTYPE", 6792 "output_column_type": "String", 6793 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6794 "operation_query": """ 6795 CASE 6796 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6797 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6798 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6799 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6800 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6801 ELSE 'UNDEFINED' 6802 END 6803 """, 6804 "info_fields": ["SVTYPE"], 6805 "operation_info": True, 6806 }, 6807 "snpeff_hgvs": { 6808 "type": "python", 6809 "name": "snpeff_hgvs", 6810 "description": "HGVS nomenclatures from snpEff annotation", 6811 "available": True, 6812 "function_name": "calculation_extract_snpeff_hgvs", 6813 "function_params": ["snpeff_hgvs", "ANN"], 6814 }, 6815 "snpeff_ann_explode": { 6816 "type": "python", 6817 "name": "snpeff_ann_explode", 6818 "description": "Explode snpEff annotations with uniquify values", 6819 "available": True, 6820 "function_name": "calculation_snpeff_ann_explode", 6821 "function_params": [False, "fields", "snpeff_", "ANN"], 6822 }, 6823 "snpeff_ann_explode_uniquify": { 6824 "type": "python", 6825 
"name": "snpeff_ann_explode_uniquify", 6826 "description": "Explode snpEff annotations", 6827 "available": True, 6828 "function_name": "calculation_snpeff_ann_explode", 6829 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6830 }, 6831 "snpeff_ann_explode_json": { 6832 "type": "python", 6833 "name": "snpeff_ann_explode_json", 6834 "description": "Explode snpEff annotations in JSON format", 6835 "available": True, 6836 "function_name": "calculation_snpeff_ann_explode", 6837 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6838 }, 6839 "NOMEN": { 6840 "type": "python", 6841 "name": "NOMEN", 6842 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6843 "available": True, 6844 "function_name": "calculation_extract_nomen", 6845 "function_params": [], 6846 }, 6847 "RENAME_INFO_FIELDS": { 6848 "type": "python", 6849 "name": "RENAME_INFO_FIELDS", 6850 "description": "Rename or remove INFO/tags", 6851 "available": True, 6852 "function_name": "calculation_rename_info_fields", 6853 "function_params": [], 6854 }, 6855 "FINDBYPIPELINE": { 6856 "type": "python", 6857 "name": "FINDBYPIPELINE", 6858 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6859 "available": True, 6860 "function_name": "calculation_find_by_pipeline", 6861 "function_params": ["findbypipeline"], 6862 }, 6863 "FINDBYSAMPLE": { 6864 "type": "python", 6865 "name": "FINDBYSAMPLE", 6866 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6867 "available": True, 6868 "function_name": "calculation_find_by_pipeline", 6869 "function_params": ["findbysample"], 6870 }, 6871 "GENOTYPECONCORDANCE": { 6872 "type": "python", 6873 "name": "GENOTYPECONCORDANCE", 6874 "description": "Concordance of genotype for multi caller VCF", 6875 "available": True, 6876 "function_name": "calculation_genotype_concordance", 6877 "function_params": [], 6878 }, 6879 
"BARCODE": { 6880 "type": "python", 6881 "name": "BARCODE", 6882 "description": "BARCODE as VaRank tool", 6883 "available": True, 6884 "function_name": "calculation_barcode", 6885 "function_params": [], 6886 }, 6887 "BARCODEFAMILY": { 6888 "type": "python", 6889 "name": "BARCODEFAMILY", 6890 "description": "BARCODEFAMILY as VaRank tool", 6891 "available": True, 6892 "function_name": "calculation_barcode_family", 6893 "function_params": ["BCF"], 6894 }, 6895 "TRIO": { 6896 "type": "python", 6897 "name": "TRIO", 6898 "description": "Inheritance for a trio family", 6899 "available": True, 6900 "function_name": "calculation_trio", 6901 "function_params": [], 6902 }, 6903 "VAF": { 6904 "type": "python", 6905 "name": "VAF", 6906 "description": "Variant Allele Frequency (VAF) harmonization", 6907 "available": True, 6908 "function_name": "calculation_vaf_normalization", 6909 "function_params": [], 6910 }, 6911 "VAF_stats": { 6912 "type": "python", 6913 "name": "VAF_stats", 6914 "description": "Variant Allele Frequency (VAF) statistics", 6915 "available": True, 6916 "function_name": "calculation_genotype_stats", 6917 "function_params": ["VAF"], 6918 }, 6919 "DP_stats": { 6920 "type": "python", 6921 "name": "DP_stats", 6922 "description": "Depth (DP) statistics", 6923 "available": True, 6924 "function_name": "calculation_genotype_stats", 6925 "function_params": ["DP"], 6926 }, 6927 "variant_id": { 6928 "type": "python", 6929 "name": "variant_id", 6930 "description": "Variant ID generated from variant position and type", 6931 "available": True, 6932 "function_name": "calculation_variant_id", 6933 "function_params": [], 6934 }, 6935 "transcripts_json": { 6936 "type": "python", 6937 "name": "transcripts_json", 6938 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6939 "available": True, 6940 "function_name": "calculation_transcripts_annotation", 6941 "function_params": ["transcripts_json", None], 6942 }, 6943 "transcripts_ann": { 6944 
"type": "python", 6945 "name": "transcripts_ann", 6946 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6947 "available": True, 6948 "function_name": "calculation_transcripts_annotation", 6949 "function_params": [None, "transcripts_ann"], 6950 }, 6951 "transcripts_annotations": { 6952 "type": "python", 6953 "name": "transcripts_annotations", 6954 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6955 "available": True, 6956 "function_name": "calculation_transcripts_annotation", 6957 "function_params": [None, None], 6958 }, 6959 "transcripts_prioritization": { 6960 "type": "python", 6961 "name": "transcripts_prioritization", 6962 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6963 "available": True, 6964 "function_name": "calculation_transcripts_prioritization", 6965 "function_params": [], 6966 }, 6967 "transcripts_export": { 6968 "type": "python", 6969 "name": "transcripts_export", 6970 "description": "Export transcripts table/view as a file (using param.json)", 6971 "available": True, 6972 "function_name": "calculation_transcripts_export", 6973 "function_params": [], 6974 }, 6975 }, 6976 "prioritizations": { 6977 "default": { 6978 "ANN2": [ 6979 { 6980 "type": "contains", 6981 "value": "HIGH", 6982 "score": 5, 6983 "flag": "PASS", 6984 "comment": [ 6985 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6986 ], 6987 }, 6988 { 6989 "type": "contains", 6990 "value": "MODERATE", 6991 "score": 3, 6992 "flag": "PASS", 6993 "comment": [ 6994 "A non-disruptive variant that might change protein effectiveness" 6995 ], 6996 }, 6997 { 6998 "type": "contains", 6999 "value": "LOW", 7000 "score": 0, 7001 "flag": "FILTERED", 7002 "comment": [ 7003 "Assumed to be mostly harmless or unlikely to change protein behavior" 7004 ], 7005 
}, 7006 { 7007 "type": "contains", 7008 "value": "MODIFIER", 7009 "score": 0, 7010 "flag": "FILTERED", 7011 "comment": [ 7012 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7013 ], 7014 }, 7015 ], 7016 } 7017 }, 7018 } 7019 7020 return config_default.get(name, None) 7021 7022 def get_config_json( 7023 self, name: str, config_dict: dict = {}, config_file: str = None 7024 ) -> dict: 7025 """ 7026 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7027 default values, a dictionary, and a file. 7028 7029 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7030 the name of the configuration. It is used to identify and retrieve the configuration settings 7031 for a specific component or module 7032 :type name: str 7033 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7034 dictionary that allows you to provide additional configuration settings or overrides. When you 7035 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7036 the key is the configuration setting you want to override or 7037 :type config_dict: dict 7038 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7039 specify the path to a configuration file that contains additional settings. If provided, the 7040 function will read the contents of this file and update the configuration dictionary with the 7041 values found in the file, overriding any existing values with the 7042 :type config_file: str 7043 :return: The function `get_config_json` returns a dictionary containing the configuration 7044 settings. 
7045 """ 7046 7047 # Create with default prioritizations 7048 config_default = self.get_config_default(name=name) 7049 configuration = config_default 7050 # log.debug(f"configuration={configuration}") 7051 7052 # Replace prioritizations from dict 7053 for config in config_dict: 7054 configuration[config] = config_dict[config] 7055 7056 # Replace prioritizations from file 7057 config_file = full_path(config_file) 7058 if config_file: 7059 if os.path.exists(config_file): 7060 with open(config_file) as config_file_content: 7061 config_file_dict = yaml.safe_load(config_file_content) 7062 for config in config_file_dict: 7063 configuration[config] = config_file_dict[config] 7064 else: 7065 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7066 log.error(msg_error) 7067 raise ValueError(msg_error) 7068 7069 return configuration 7070 7071 def prioritization( 7072 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7073 ) -> bool: 7074 """ 7075 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7076 prioritizes variants based on configured profiles and criteria. 7077 7078 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7079 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7080 a table name is provided, the method will prioritize the variants in that specific table 7081 :type table: str 7082 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7083 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7084 provided, the code will use a default prefix value of "PZ" 7085 :type pz_prefix: str 7086 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7087 additional parameters specific to the prioritization process. 
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function processes VCF files, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        For each configured profile, per-profile columns (Score, Flag, Class,
        Comment, Infos) are added to the variants table, filled by SQL UPDATE
        queries generated from the profile's criteria, merged back into the
        INFO field, and finally dropped.

        :param table: The `table` parameter is used to specify the name
            of the table (presumably a VCF file) on which the prioritization operation will be performed. If
            a table name is provided, the method will prioritize the variants in that specific table
        :type table: str
        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
            certain INFO fields in a VCF file during the prioritization process. If this parameter is not
            provided, the code will use a default prefix value of "PZ"
        :type pz_prefix: str
        :param pz_param: The `pz_param` parameter is used to pass
            additional parameters specific to the prioritization process. These parameters can include
            settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
            configurations needed for the prioritization of variants in a VCF
        :type pz_param: dict
        :return: True on completion; False when no profile is defined
        :raises ValueError: if a requested profile is not configured, an
            annotation field is missing from the data, or a criterion mode is invalid
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: an explicit pz_param dict takes precedence over
        # the "prioritization" section of the object's parameters
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (built-in defaults, optionally overridden by file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix (e.g. "PZ" -> PZScore, PZFlag, ...)
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings are normalized to lists)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: top-level "prioritizations" shortcut adds profiles
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First requested profile becomes the default unless explicitly set
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Added columns (temporary working columns, dropped at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: bare fields plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists: only fields absent from the
        # VCF header are (re)computed
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF INFO header metadata per PZ field)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (bare fields get the default profile
            # appended to their description)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZfield, typed by field kind
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set: build the SQL fragments that serialize
                        # each computed column back into the INFO field

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        # Default profile also feeds the bare (un-suffixed) field
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag (boolean column rendered as PASS/FILTERED)
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZClass (list column joined with ',', '.' when empty)
                        if (
                            f"{pz_prefix}Class{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )

                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Class" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )
                                """
                            )

                        # PZComment (emitted only when non-empty)
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos (emitted only when non-empty)
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields: join all fragments, ';'-separated after
                        # the first one
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # skip special sections
                            if annotation.startswith("_"):
                                continue

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:

                                # Criterion mode: "operation" (type/value
                                # comparison) or free "sql" (sql/fields keys)
                                criterion_mode = None
                                if np.any(
                                    np.isin(list(criterion.keys()), ["type", "value"])
                                ):
                                    criterion_mode = "operation"
                                elif np.any(
                                    np.isin(list(criterion.keys()), ["sql", "fields"])
                                ):
                                    criterion_mode = "sql"
                                log.debug(f"Criterion Mode: {criterion_mode}")

                                # Criterion parameters
                                criterion_type = criterion.get("type", None)
                                criterion_value = criterion.get("value", None)
                                criterion_sql = criterion.get("sql", None)
                                criterion_fields = criterion.get("fields", None)
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_class = criterion.get("class", None)
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Sanitize free text so it is safe inside SQL
                                # string literals and VCF INFO (no quotes, ';', tabs)
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                # SQL given as a list is joined into one clause
                                if criterion_sql is not None and isinstance(
                                    criterion_sql, list
                                ):
                                    criterion_sql = " ".join(criterion_sql)

                                # Fields and explode
                                if criterion_fields is None:
                                    criterion_fields = [annotation]
                                if not isinstance(criterion_fields, list):
                                    criterion_fields = str(criterion_fields).split(",")

                                # Class
                                if criterion_class is not None and not isinstance(
                                    criterion_class, list
                                ):
                                    criterion_class = str(criterion_class).split(",")

                                for annotation_field in criterion_fields:

                                    # Explode specific annotation
                                    log.debug(
                                        f"Explode annotation '{annotation_field}'"
                                    )
                                    added_columns += self.explode_infos(
                                        prefix=explode_infos_prefix,
                                        fields=[annotation_field],
                                        table=table_variants,
                                    )
                                    extra_infos = self.get_extra_infos(
                                        table=table_variants
                                    )

                                    # Check if annotation field is present
                                    if (
                                        f"{explode_infos_prefix}{annotation_field}"
                                        not in extra_infos
                                    ):
                                        msq_err = f"Annotation '{annotation_field}' not in data"
                                        log.error(msq_err)
                                        raise ValueError(msq_err)
                                    else:
                                        log.debug(
                                            f"Annotation '{annotation_field}' in data"
                                        )

                                    sql_set = []
                                    # NOTE(review): this re-binds sql_set_info but
                                    # sql_set_info_option was already built above,
                                    # so the outer list is no longer read here
                                    sql_set_info = []

                                    # PZ fields set

                                    # PZScore
                                    if (
                                        f"{pz_prefix}Score{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        # VaRank prioritization score mode: keep max
                                        if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
                                            sql_set.append(
                                                f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
                                            )
                                        # default HOWARD prioritization score mode: sum
                                        else:
                                            sql_set.append(
                                                f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                            )

                                    # PZFlag (AND: one FILTERED criterion filters the variant)
                                    if (
                                        f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                        )

                                    # PZClass
                                    if (
                                        f"{pz_prefix}Class{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                        and criterion_class is not None
                                    ):
                                        sql_set.append(
                                            f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                        )

                                    # PZComment
                                    if (
                                        f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"""
                                            {pz_prefix}Comment{pzfields_sep}{profile} =
                                                concat(
                                                    {pz_prefix}Comment{pzfields_sep}{profile},
                                                    CASE
                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                        THEN ', '
                                                        ELSE ''
                                                    END,
                                                    '{criterion_comment}'
                                                )
                                            """
                                        )

                                    # PZInfos
                                    if (
                                        f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"""
                                            {pz_prefix}Infos{pzfields_sep}{profile} =
                                                concat(
                                                    {pz_prefix}Infos{pzfields_sep}{profile},
                                                    '{criterion_infos}'
                                                )
                                            """
                                        )
                                    sql_set_option = ",".join(sql_set)

                                    # Criterion and comparison
                                    if sql_set_option:

                                        if criterion_mode in ["operation"]:

                                            # NOTE(review): the WHERE clause filters on
                                            # '{annotation}' (the section name), not on the
                                            # exploded '{annotation_field}' — only equivalent
                                            # when fields == [annotation]; confirm intent
                                            try:
                                                # Numeric value: cast column and compare
                                                float(criterion_value)
                                                sql_update = f"""
                                                    UPDATE {table_variants}
                                                    SET {sql_set_option}
                                                    WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                                    AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                                """
                                            # Non-numeric value: fall back to string/regex match
                                            # (bare except also hides other errors — deliberate best-effort)
                                            except:
                                                contains_option = ""
                                                if criterion_type == "contains":
                                                    contains_option = ".*"
                                                sql_update = f"""
                                                    UPDATE {table_variants}
                                                    SET {sql_set_option}
                                                    WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                                """
                                            sql_queries.append(sql_update)

                                        elif criterion_mode in ["sql"]:

                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE {criterion_sql}
                                            """
                                            sql_queries.append(sql_update)

                                        else:
                                            msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                            log.error(msg_err)
                                            raise ValueError(msg_err)

                                    else:
                                        log.warning(
                                            f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                        )

                        # PZTags: aggregate field#value pairs into a single tag string
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZFalgs value
                            pztags_value = ""
                            pztags_sep_default = ","
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        elif pzfield in [f"{pz_prefix}Class"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                    ELSE '.'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZFlags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZFlags for default
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            # Run all per-criterion updates, then merge the
                            # computed columns into the INFO field
                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (temporary working columns)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
7835 """ 7836 return partition.apply(annotation_hgvs_partition, axis=1) 7837 7838 def annotation_hgvs_partition(row) -> str: 7839 """ 7840 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7841 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7842 7843 :param row: A dictionary-like object that contains the values for the following keys: 7844 :return: a string that contains the HGVS names associated with the given row of data. 7845 """ 7846 7847 chr = row["CHROM"] 7848 pos = row["POS"] 7849 ref = row["REF"] 7850 alt = row["ALT"] 7851 7852 # Find list of associated transcripts 7853 transcripts_list = list( 7854 polars_conn.execute( 7855 f""" 7856 SELECT transcript 7857 FROM refseq_df 7858 WHERE CHROM='{chr}' 7859 AND POS={pos} 7860 """ 7861 )["transcript"] 7862 ) 7863 7864 # Full HGVS annotation in list 7865 hgvs_full_list = [] 7866 7867 for transcript_name in transcripts_list: 7868 7869 # Transcript 7870 transcript = get_transcript( 7871 transcripts=transcripts, transcript_name=transcript_name 7872 ) 7873 # Exon 7874 if use_exon: 7875 exon = transcript.find_exon_number(pos) 7876 else: 7877 exon = None 7878 # Protein 7879 transcript_protein = None 7880 if use_protein or add_protein or full_format: 7881 transcripts_protein = list( 7882 polars_conn.execute( 7883 f""" 7884 SELECT protein 7885 FROM refseqlink_df 7886 WHERE transcript='{transcript_name}' 7887 LIMIT 1 7888 """ 7889 )["protein"] 7890 ) 7891 if len(transcripts_protein): 7892 transcript_protein = transcripts_protein[0] 7893 7894 # HGVS name 7895 hgvs_name = format_hgvs_name( 7896 chr, 7897 pos, 7898 ref, 7899 alt, 7900 genome=genome, 7901 transcript=transcript, 7902 transcript_protein=transcript_protein, 7903 exon=exon, 7904 use_gene=use_gene, 7905 use_protein=use_protein, 7906 full_format=full_format, 7907 use_version=use_version, 7908 codon_type=codon_type, 7909 ) 7910 hgvs_full_list.append(hgvs_name) 7911 if add_protein and not 
use_protein and not full_format: 7912 hgvs_name = format_hgvs_name( 7913 chr, 7914 pos, 7915 ref, 7916 alt, 7917 genome=genome, 7918 transcript=transcript, 7919 transcript_protein=transcript_protein, 7920 exon=exon, 7921 use_gene=use_gene, 7922 use_protein=True, 7923 full_format=False, 7924 use_version=use_version, 7925 codon_type=codon_type, 7926 ) 7927 hgvs_full_list.append(hgvs_name) 7928 7929 # Create liste of HGVS annotations 7930 hgvs_full = ",".join(hgvs_full_list) 7931 7932 return hgvs_full 7933 7934 # Polars connexion 7935 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7936 7937 # Config 7938 config = self.get_config() 7939 7940 # Databases 7941 # Genome 7942 databases_genomes_folders = ( 7943 config.get("folders", {}) 7944 .get("databases", {}) 7945 .get("genomes", DEFAULT_GENOME_FOLDER) 7946 ) 7947 databases_genome = ( 7948 config.get("folders", {}).get("databases", {}).get("genomes", "") 7949 ) 7950 # refseq database folder 7951 databases_refseq_folders = ( 7952 config.get("folders", {}) 7953 .get("databases", {}) 7954 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7955 ) 7956 # refseq 7957 databases_refseq = config.get("databases", {}).get("refSeq", None) 7958 # refSeqLink 7959 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7960 7961 # Param 7962 param = self.get_param() 7963 7964 # Quick HGVS 7965 if "hgvs_options" in param and param.get("hgvs_options", ""): 7966 log.info(f"Quick HGVS Annotation:") 7967 if not param.get("hgvs", None): 7968 param["hgvs"] = {} 7969 for option in param.get("hgvs_options", "").split(","): 7970 option_var_val = option.split("=") 7971 option_var = option_var_val[0] 7972 if len(option_var_val) > 1: 7973 option_val = option_var_val[1] 7974 else: 7975 option_val = "True" 7976 if option_val.upper() in ["TRUE"]: 7977 option_val = True 7978 elif option_val.upper() in ["FALSE"]: 7979 option_val = False 7980 log.info(f" {option_var}={option_val}") 7981 param["hgvs"][option_var] = option_val 7982 
7983 # Check if HGVS annotation enabled 7984 if "hgvs" in param: 7985 log.info(f"HGVS Annotation... ") 7986 for hgvs_option in param.get("hgvs", {}): 7987 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7988 else: 7989 return 7990 7991 # HGVS Param 7992 param_hgvs = param.get("hgvs", {}) 7993 use_exon = param_hgvs.get("use_exon", False) 7994 use_gene = param_hgvs.get("use_gene", False) 7995 use_protein = param_hgvs.get("use_protein", False) 7996 add_protein = param_hgvs.get("add_protein", False) 7997 full_format = param_hgvs.get("full_format", False) 7998 use_version = param_hgvs.get("use_version", False) 7999 codon_type = param_hgvs.get("codon_type", "3") 8000 8001 # refSseq refSeqLink 8002 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8003 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8004 8005 # Assembly 8006 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8007 8008 # Genome 8009 genome_file = None 8010 if find_genome(databases_genome): 8011 genome_file = find_genome(databases_genome) 8012 else: 8013 genome_file = find_genome( 8014 genome_path=databases_genomes_folders, assembly=assembly 8015 ) 8016 log.debug("Genome: " + str(genome_file)) 8017 8018 # refSseq 8019 refseq_file = find_file_prefix( 8020 input_file=databases_refseq, 8021 prefix="ncbiRefSeq", 8022 folder=databases_refseq_folders, 8023 assembly=assembly, 8024 ) 8025 log.debug("refSeq: " + str(refseq_file)) 8026 8027 # refSeqLink 8028 refseqlink_file = find_file_prefix( 8029 input_file=databases_refseqlink, 8030 prefix="ncbiRefSeqLink", 8031 folder=databases_refseq_folders, 8032 assembly=assembly, 8033 ) 8034 log.debug("refSeqLink: " + str(refseqlink_file)) 8035 8036 # Threads 8037 if not threads: 8038 threads = self.get_threads() 8039 log.debug("Threads: " + str(threads)) 8040 8041 # Variables 8042 table_variants = self.get_table_variants(clause="update") 8043 8044 # Get variants SNV and InDel only 8045 
query_variants = f""" 8046 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8047 FROM {table_variants} 8048 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8049 """ 8050 df_variants = self.get_query_to_df(query_variants) 8051 8052 # Added columns 8053 added_columns = [] 8054 8055 # Add hgvs column in variants table 8056 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8057 added_column = self.add_column( 8058 table_variants, hgvs_column_name, "STRING", default_value=None 8059 ) 8060 added_columns.append(added_column) 8061 8062 log.debug(f"refSeq loading...") 8063 # refSeq in duckDB 8064 refseq_table = get_refseq_table( 8065 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8066 ) 8067 # Loading all refSeq in Dataframe 8068 refseq_query = f""" 8069 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8070 FROM {refseq_table} 8071 JOIN df_variants ON ( 8072 {refseq_table}.chrom = df_variants.CHROM 8073 AND {refseq_table}.txStart<=df_variants.POS 8074 AND {refseq_table}.txEnd>=df_variants.POS 8075 ) 8076 """ 8077 refseq_df = self.conn.query(refseq_query).pl() 8078 8079 if refseqlink_file: 8080 log.debug(f"refSeqLink loading...") 8081 # refSeqLink in duckDB 8082 refseqlink_table = get_refseq_table( 8083 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8084 ) 8085 # Loading all refSeqLink in Dataframe 8086 protacc_column = "protAcc_with_ver" 8087 mrnaacc_column = "mrnaAcc_with_ver" 8088 refseqlink_query = f""" 8089 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8090 FROM {refseqlink_table} 8091 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8092 WHERE protAcc_without_ver IS NOT NULL 8093 """ 8094 # Polars Dataframe 8095 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8096 8097 # Read RefSeq transcripts into a python dict/model. 
8098 log.debug(f"Transcripts loading...") 8099 with tempfile.TemporaryDirectory() as tmpdir: 8100 transcripts_query = f""" 8101 COPY ( 8102 SELECT {refseq_table}.* 8103 FROM {refseq_table} 8104 JOIN df_variants ON ( 8105 {refseq_table}.chrom=df_variants.CHROM 8106 AND {refseq_table}.txStart<=df_variants.POS 8107 AND {refseq_table}.txEnd>=df_variants.POS 8108 ) 8109 ) 8110 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8111 """ 8112 self.conn.query(transcripts_query) 8113 with open(f"{tmpdir}/transcript.tsv") as infile: 8114 transcripts = read_transcripts(infile) 8115 8116 # Polars connexion 8117 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8118 8119 log.debug("Genome loading...") 8120 # Read genome sequence using pyfaidx. 8121 genome = Fasta(genome_file) 8122 8123 log.debug("Start annotation HGVS...") 8124 8125 # Create 8126 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8127 ddf = dd.from_pandas(df_variants, npartitions=threads) 8128 8129 # Use dask.dataframe.apply() to apply function on each partition 8130 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8131 8132 # Convert Dask DataFrame to Pandas Dataframe 8133 df = ddf.compute() 8134 8135 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8136 with tempfile.TemporaryDirectory() as tmpdir: 8137 df_parquet = os.path.join(tmpdir, "df.parquet") 8138 df.to_parquet(df_parquet) 8139 8140 # Update hgvs column 8141 update_variant_query = f""" 8142 UPDATE {table_variants} 8143 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8144 FROM read_parquet('{df_parquet}') as df 8145 WHERE variants."#CHROM" = df.CHROM 8146 AND variants.POS = df.POS 8147 AND variants.REF = df.REF 8148 AND variants.ALT = df.ALT 8149 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8150 """ 8151 self.execute_query(update_variant_query) 8152 8153 # Update INFO column 8154 sql_query_update = f""" 8155 UPDATE {table_variants} 8156 SET INFO = 8157 concat( 8158 CASE 8159 WHEN INFO NOT IN ('','.') 8160 THEN concat(INFO, ';') 8161 ELSE '' 8162 END, 8163 'hgvs=', 8164 {hgvs_column_name} 8165 ) 8166 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8167 """ 8168 self.execute_query(sql_query_update) 8169 8170 # Add header 8171 HGVS_INFOS = { 8172 "hgvs": { 8173 "ID": "hgvs", 8174 "Number": ".", 8175 "Type": "String", 8176 "Description": f"HGVS annotatation with HOWARD", 8177 } 8178 } 8179 8180 for field in HGVS_INFOS: 8181 field_ID = HGVS_INFOS[field]["ID"] 8182 field_description = HGVS_INFOS[field]["Description"] 8183 self.get_header().infos[field_ID] = vcf.parser._Info( 8184 field_ID, 8185 HGVS_INFOS[field]["Number"], 8186 HGVS_INFOS[field]["Type"], 8187 field_description, 8188 "unknown", 8189 "unknown", 8190 code_type_map[HGVS_INFOS[field]["Type"]], 8191 ) 8192 8193 # Remove added columns 8194 for added_column in added_columns: 8195 self.drop_column(column=added_column) 8196 8197 ### 8198 # Calculation 8199 ### 8200 8201 def get_operations_help( 8202 self, operations_config_dict: dict = {}, operations_config_file: str = None 8203 ) -> list: 8204 8205 # Init 8206 operations_help = [] 8207 8208 # operations 8209 operations = self.get_config_json( 8210 name="calculations", 8211 
config_dict=operations_config_dict, 8212 config_file=operations_config_file, 8213 ) 8214 for op in operations: 8215 op_name = operations[op].get("name", op).upper() 8216 op_description = operations[op].get("description", op_name) 8217 op_available = operations[op].get("available", False) 8218 if op_available: 8219 operations_help.append(f" {op_name}: {op_description}") 8220 8221 # Sort operations 8222 operations_help.sort() 8223 8224 # insert header 8225 operations_help.insert(0, "Available calculation operations:") 8226 8227 # Return 8228 return operations_help 8229 8230 def calculation( 8231 self, 8232 operations: dict = {}, 8233 operations_config_dict: dict = {}, 8234 operations_config_file: str = None, 8235 ) -> None: 8236 """ 8237 It takes a list of operations, and for each operation, it checks if it's a python or sql 8238 operation, and then calls the appropriate function 8239 8240 param json example: 8241 "calculation": { 8242 "NOMEN": { 8243 "options": { 8244 "hgvs_field": "hgvs" 8245 }, 8246 "middle" : null 8247 } 8248 """ 8249 8250 # Param 8251 param = self.get_param() 8252 8253 # CHeck operations config file 8254 if operations_config_file is None: 8255 operations_config_file = param.get("calculation", {}).get( 8256 "calculation_config", None 8257 ) 8258 8259 # operations config 8260 operations_config = self.get_config_json( 8261 name="calculations", 8262 config_dict=operations_config_dict, 8263 config_file=operations_config_file, 8264 ) 8265 8266 # Upper keys 8267 operations_config = {k.upper(): v for k, v in operations_config.items()} 8268 8269 # Calculations 8270 8271 # Operations from param 8272 operations = param.get("calculation", {}).get("calculations", operations) 8273 8274 # Quick calculation - add 8275 if param.get("calculations", None): 8276 8277 # List of operations 8278 calculations_list = [ 8279 value.strip() for value in param.get("calculations", "").split(",") 8280 ] 8281 8282 # Log 8283 log.info(f"Quick Calculations:") 8284 for 
calculation_key in calculations_list: 8285 log.info(f" {calculation_key}") 8286 8287 # Create tmp operations (to keep operation order) 8288 operations_tmp = {} 8289 for calculation_operation in calculations_list: 8290 if calculation_operation.upper() not in operations_tmp: 8291 log.debug( 8292 f"{calculation_operation}.upper() not in {operations_tmp}" 8293 ) 8294 operations_tmp[calculation_operation.upper()] = {} 8295 add_value_into_dict( 8296 dict_tree=operations_tmp, 8297 sections=[ 8298 calculation_operation.upper(), 8299 ], 8300 value=operations.get(calculation_operation.upper(), {}), 8301 ) 8302 # Add operations already in param 8303 for calculation_operation in operations: 8304 if calculation_operation not in operations_tmp: 8305 operations_tmp[calculation_operation] = operations.get( 8306 calculation_operation, {} 8307 ) 8308 8309 # Update operations in param 8310 operations = operations_tmp 8311 8312 # Operations for calculation 8313 if not operations: 8314 operations = param.get("calculation", {}).get("calculations", {}) 8315 8316 if operations: 8317 log.info(f"Calculations...") 8318 8319 # For each operations 8320 for operation_name in operations: 8321 operation_name = operation_name.upper() 8322 if operation_name not in [""]: 8323 if operation_name in operations_config: 8324 log.info(f"Calculation '{operation_name}'") 8325 operation = operations_config[operation_name] 8326 operation_type = operation.get("type", "sql") 8327 if operation_type == "python": 8328 self.calculation_process_function( 8329 operation=operation, operation_name=operation_name 8330 ) 8331 elif operation_type == "sql": 8332 self.calculation_process_sql( 8333 operation=operation, operation_name=operation_name 8334 ) 8335 else: 8336 log.error( 8337 f"Operations config: Type '{operation_type}' NOT available" 8338 ) 8339 raise ValueError( 8340 f"Operations config: Type '{operation_type}' NOT available" 8341 ) 8342 else: 8343 log.error( 8344 f"Operations config: Calculation 
'{operation_name}' NOT available" 8345 ) 8346 raise ValueError( 8347 f"Operations config: Calculation '{operation_name}' NOT available" 8348 ) 8349 8350 # Explode INFOS fields into table fields 8351 if self.get_explode_infos(): 8352 self.explode_infos( 8353 prefix=self.get_explode_infos_prefix(), 8354 fields=self.get_explode_infos_fields(), 8355 force=True, 8356 ) 8357 8358 def calculation_process_sql( 8359 self, operation: dict, operation_name: str = "unknown" 8360 ) -> None: 8361 """ 8362 The `calculation_process_sql` function takes in a mathematical operation as a string and 8363 performs the operation, updating the specified table with the result. 8364 8365 :param operation: The `operation` parameter is a dictionary that contains information about the 8366 mathematical operation to be performed. It includes the following keys: 8367 :type operation: dict 8368 :param operation_name: The `operation_name` parameter is a string that represents the name of 8369 the mathematical operation being performed. 
It is used for logging and error handling purposes, 8370 defaults to unknown 8371 :type operation_name: str (optional) 8372 """ 8373 8374 # Operation infos 8375 operation_name = operation.get("name", "unknown") 8376 log.debug(f"process SQL {operation_name}") 8377 output_column_name = operation.get("output_column_name", operation_name) 8378 output_column_type = operation.get("output_column_type", "String") 8379 prefix = operation.get("explode_infos_prefix", "") 8380 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8381 output_column_description = operation.get( 8382 "output_column_description", f"{operation_name} operation" 8383 ) 8384 operation_query = operation.get("operation_query", None) 8385 if isinstance(operation_query, list): 8386 operation_query = " ".join(operation_query) 8387 operation_info_fields = operation.get("info_fields", []) 8388 operation_info_fields_check = operation.get("info_fields_check", False) 8389 operation_info = operation.get("operation_info", True) 8390 operation_table = operation.get( 8391 "table", self.get_table_variants(clause="alter") 8392 ) 8393 8394 # table variants 8395 if operation_table: 8396 table_variants = operation_table 8397 else: 8398 table_variants = self.get_table_variants(clause="alter") 8399 8400 if operation_query: 8401 8402 # Info fields check 8403 operation_info_fields_check_result = True 8404 if operation_info_fields_check: 8405 header_infos = self.get_header().infos 8406 for info_field in operation_info_fields: 8407 operation_info_fields_check_result = ( 8408 operation_info_fields_check_result 8409 and info_field in header_infos 8410 ) 8411 8412 # If info fields available 8413 if operation_info_fields_check_result: 8414 8415 # Added_columns 8416 added_columns = [] 8417 8418 # Create VCF header field 8419 vcf_reader = self.get_header() 8420 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8421 output_column_name, 8422 ".", 8423 output_column_type, 8424 
output_column_description, 8425 "howard calculation", 8426 "0", 8427 self.code_type_map.get(output_column_type), 8428 ) 8429 8430 # Explode infos if needed 8431 log.debug(f"calculation_process_sql prefix {prefix}") 8432 added_columns += self.explode_infos( 8433 prefix=prefix, 8434 fields=[output_column_name] + operation_info_fields, 8435 force=False, 8436 table=table_variants, 8437 ) 8438 8439 # Create column 8440 added_column = self.add_column( 8441 table_name=table_variants, 8442 column_name=prefix + output_column_name, 8443 column_type=output_column_type_sql, 8444 default_value="null", 8445 ) 8446 added_columns.append(added_column) 8447 8448 # Operation calculation 8449 try: 8450 8451 # Query to update calculation column 8452 sql_update = f""" 8453 UPDATE {table_variants} 8454 SET "{prefix}{output_column_name}" = ({operation_query}) 8455 """ 8456 self.conn.execute(sql_update) 8457 8458 # Add to INFO 8459 if operation_info: 8460 sql_update_info = f""" 8461 UPDATE {table_variants} 8462 SET "INFO" = 8463 concat( 8464 CASE 8465 WHEN "INFO" IS NOT NULL 8466 THEN concat("INFO", ';') 8467 ELSE '' 8468 END, 8469 '{output_column_name}=', 8470 "{prefix}{output_column_name}" 8471 ) 8472 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8473 """ 8474 self.conn.execute(sql_update_info) 8475 8476 except: 8477 log.error( 8478 f"Operations config: Calculation '{operation_name}' query failed" 8479 ) 8480 raise ValueError( 8481 f"Operations config: Calculation '{operation_name}' query failed" 8482 ) 8483 8484 # Remove added columns 8485 for added_column in added_columns: 8486 log.debug(f"added_column: {added_column}") 8487 self.drop_column(column=added_column) 8488 8489 else: 8490 log.error( 8491 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8492 ) 8493 raise ValueError( 8494 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
    def calculation_process_function(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_function` takes in an operation dictionary and performs the specified
        function with the given parameters.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        operation to be performed. Mandatory keys: 'name', 'function_name' (a method of
        this object) and 'function_params' (positional arguments list).
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the operation being performed. It is used for logging purposes, defaults to unknown
        :type operation_name: str (optional)
        :raises KeyError: if a mandatory key is missing from `operation`
        """

        # Mandatory keys are accessed with [] — a missing key raises KeyError
        operation_name = operation["name"]
        log.debug(f"process Python {operation_name}")
        function_name = operation["function_name"]
        function_params = operation["function_params"]
        # Dispatch: call the method named 'function_name' on this object
        getattr(self, function_name)(*function_params)

    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.
        """

        # variant_id annotation field
        # NOTE(review): get_variant_id_column presumably also creates the column — TODO confirm
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id hgvs tags
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<variant_id_tag>=<value>' to INFO (with ';' separator when
        # INFO already holds content; '' and '.' count as empty)
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns (the variant_id column is only needed transiently)
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: name of the INFO tag that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: field in the VCF file that contains SnpEff annotations,
        defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the ANN header description cannot be parsed
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty configured prefix is forced to "INFO/" — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (names of the exploded columns)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff puts the sub-field names in the
            # description, single-quoted and separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the normalized key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (used as join key for the update below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe of variant ids and raw ANN values
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create the HGVS column by parsing each ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO, joining the dataframe on
            # the variant id column; empty/'.'/'NaN' values are skipped
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the (potentially large) dataframe eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: boolean flag that determines whether the output should be
        uniquified (duplicate entries removed), defaults to True
        :type uniquify: bool (optional)
        :param output_format: format of the output annotations: "fields" (one INFO tag
        per ANN sub-field) or "JSON" (a single JSON-valued tag), defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated annotations to
        differentiate them from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: field in the VCF file that contains SnpEff annotations,
        defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the ANN header description cannot be parsed
        """

        # SnpEff annotation field (internal working column name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty configured prefix is forced to "INFO/" — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (names of the exploded columns)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: snpEff puts the sub-field names in the
            # description, single-quoted and separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the normalized key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (used as join key for the update below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe of variant ids and raw ANN values
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns by exploding each ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one JSON tag, or one tag per ANN sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # In JSON mode the exploded value is prefixed with '<output_prefix>='
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO, joining the dataframe on
            # the variant id column; empty/'.'/'NaN' values are skipped
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the (potentially large) dataframe eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        All options are read from param 'calculation.calculations.NOMEN.options'
        (hgvs_field, pattern, transcripts, transcripts_table, transcripts_column,
        transcripts_order). One INFO tag per NOMEN component is produced.
        """

        # Name of the struct column holding all NOMEN components per variant
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: component tag -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads (currently unused in this method)
        threads = self.get_threads()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Get HGVS field (source of the nomenclatures to parse)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources ('file' and/or 'column')
        transcripts_sources = {}

        # Get transcripts of preference from a file (first column)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        # Per-variant preferred transcript column (NULL when not configured)
        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe with the HGVS string and the per-variant transcript
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcripts rank: preference order, 1 = most preferred
            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
            transcripts_len = len(transcripts_rank)

            # Create main NOMEN column (a dict of components per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len
                ),
                axis=1,
            )

            # Explode NOMEN structure and build one SQL CASE fragment per component
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Append ';<FIELD>=<value>' when the component is non-empty
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO, joining the dataframe on the variant key columns
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the (potentially large) dataframe eagerly
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It is used to create the annotation field in the 9101 VCF header and to update the corresponding field in the variants table, defaults to 9102 findbypipeline 9103 :type tag: str (optional) 9104 """ 9105 9106 # if FORMAT and samples 9107 if ( 9108 "FORMAT" in self.get_header_columns_as_list() 9109 and self.get_header_sample_list() 9110 ): 9111 9112 # findbypipeline annotation field 9113 findbypipeline_tag = tag 9114 9115 # VCF infos tags 9116 vcf_infos_tags = { 9117 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9118 } 9119 9120 # Prefix 9121 prefix = self.get_explode_infos_prefix() 9122 9123 # Field 9124 findbypipeline_infos = prefix + findbypipeline_tag 9125 9126 # Variants table 9127 table_variants = self.get_table_variants() 9128 9129 # Header 9130 vcf_reader = self.get_header() 9131 9132 # Create variant id 9133 variant_id_column = self.get_variant_id_column() 9134 added_columns = [variant_id_column] 9135 9136 # variant_id, FORMAT and samples 9137 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9138 self.get_header_sample_list() 9139 ) 9140 9141 # Create dataframe 9142 dataframe_findbypipeline = self.get_query_to_df( 9143 f""" SELECT {samples_fields} FROM {table_variants} """ 9144 ) 9145 9146 # Create findbypipeline column 9147 dataframe_findbypipeline[findbypipeline_infos] = ( 9148 dataframe_findbypipeline.apply( 9149 lambda row: findbypipeline( 9150 row, samples=self.get_header_sample_list() 9151 ), 9152 axis=1, 9153 ) 9154 ) 9155 9156 # Add snpeff_hgvs to header 9157 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9158 findbypipeline_tag, 9159 ".", 9160 "String", 9161 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9162 "howard calculation", 9163 "0", 9164 self.code_type_map.get("String"), 9165 ) 9166 9167 # Update 9168 sql_update = f""" 9169 UPDATE variants 9170 SET "INFO" = 9171 concat( 9172 CASE 9173 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9174 THEN '' 9175 ELSE 
concat("INFO", ';') 9176 END, 9177 CASE 9178 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9179 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9180 THEN concat( 9181 '{findbypipeline_tag}=', 9182 dataframe_findbypipeline."{findbypipeline_infos}" 9183 ) 9184 ELSE '' 9185 END 9186 ) 9187 FROM dataframe_findbypipeline 9188 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9189 """ 9190 self.conn.execute(sql_update) 9191 9192 # Remove added columns 9193 for added_column in added_columns: 9194 self.drop_column(column=added_column) 9195 9196 # Delete dataframe 9197 del dataframe_findbypipeline 9198 gc.collect() 9199 9200 def calculation_genotype_concordance(self) -> None: 9201 """ 9202 The function `calculation_genotype_concordance` calculates the genotype concordance for 9203 multi-caller VCF files and updates the variant information in the database. 9204 """ 9205 9206 # if FORMAT and samples 9207 if ( 9208 "FORMAT" in self.get_header_columns_as_list() 9209 and self.get_header_sample_list() 9210 ): 9211 9212 # genotypeconcordance annotation field 9213 genotypeconcordance_tag = "genotypeconcordance" 9214 9215 # VCF infos tags 9216 vcf_infos_tags = { 9217 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9218 } 9219 9220 # Prefix 9221 prefix = self.get_explode_infos_prefix() 9222 9223 # Field 9224 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9225 9226 # Variants table 9227 table_variants = self.get_table_variants() 9228 9229 # Header 9230 vcf_reader = self.get_header() 9231 9232 # Create variant id 9233 variant_id_column = self.get_variant_id_column() 9234 added_columns = [variant_id_column] 9235 9236 # variant_id, FORMAT and samples 9237 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9238 self.get_header_sample_list() 9239 ) 9240 9241 # Create dataframe 9242 dataframe_genotypeconcordance = self.get_query_to_df( 9243 f""" SELECT 
{samples_fields} FROM {table_variants} """ 9244 ) 9245 9246 # Create genotypeconcordance column 9247 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9248 dataframe_genotypeconcordance.apply( 9249 lambda row: genotypeconcordance( 9250 row, samples=self.get_header_sample_list() 9251 ), 9252 axis=1, 9253 ) 9254 ) 9255 9256 # Add genotypeconcordance to header 9257 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9258 genotypeconcordance_tag, 9259 ".", 9260 "String", 9261 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9262 "howard calculation", 9263 "0", 9264 self.code_type_map.get("String"), 9265 ) 9266 9267 # Update 9268 sql_update = f""" 9269 UPDATE variants 9270 SET "INFO" = 9271 concat( 9272 CASE 9273 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9274 THEN '' 9275 ELSE concat("INFO", ';') 9276 END, 9277 CASE 9278 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9279 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9280 THEN concat( 9281 '{genotypeconcordance_tag}=', 9282 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9283 ) 9284 ELSE '' 9285 END 9286 ) 9287 FROM dataframe_genotypeconcordance 9288 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9289 """ 9290 self.conn.execute(sql_update) 9291 9292 # Remove added columns 9293 for added_column in added_columns: 9294 self.drop_column(column=added_column) 9295 9296 # Delete dataframe 9297 del dataframe_genotypeconcordance 9298 gc.collect() 9299 9300 def calculation_barcode(self, tag: str = "barcode") -> None: 9301 """ 9302 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9303 updates the INFO field in the file with the calculated barcode values. 9304 9305 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9306 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 9307 the default tag name is set to "barcode", defaults to barcode 9308 :type tag: str (optional) 9309 """ 9310 9311 # if FORMAT and samples 9312 if ( 9313 "FORMAT" in self.get_header_columns_as_list() 9314 and self.get_header_sample_list() 9315 ): 9316 9317 # barcode annotation field 9318 if not tag: 9319 tag = "barcode" 9320 9321 # VCF infos tags 9322 vcf_infos_tags = { 9323 tag: "barcode calculation (VaRank)", 9324 } 9325 9326 # Prefix 9327 prefix = self.get_explode_infos_prefix() 9328 9329 # Field 9330 barcode_infos = prefix + tag 9331 9332 # Variants table 9333 table_variants = self.get_table_variants() 9334 9335 # Header 9336 vcf_reader = self.get_header() 9337 9338 # Create variant id 9339 variant_id_column = self.get_variant_id_column() 9340 added_columns = [variant_id_column] 9341 9342 # variant_id, FORMAT and samples 9343 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9344 self.get_header_sample_list() 9345 ) 9346 9347 # Create dataframe 9348 dataframe_barcode = self.get_query_to_df( 9349 f""" SELECT {samples_fields} FROM {table_variants} """ 9350 ) 9351 9352 # Create barcode column 9353 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9354 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9355 ) 9356 9357 # Add barcode to header 9358 vcf_reader.infos[tag] = vcf.parser._Info( 9359 tag, 9360 ".", 9361 "String", 9362 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9363 "howard calculation", 9364 "0", 9365 self.code_type_map.get("String"), 9366 ) 9367 9368 # Update 9369 sql_update = f""" 9370 UPDATE {table_variants} 9371 SET "INFO" = 9372 concat( 9373 CASE 9374 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9375 THEN '' 9376 ELSE concat("INFO", ';') 9377 END, 9378 CASE 9379 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9380 AND dataframe_barcode."{barcode_infos}" NOT NULL 9381 THEN concat( 9382 '{tag}=', 9383 dataframe_barcode."{barcode_infos}" 9384 ) 9385 ELSE '' 9386 
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode for variants and append it, together with
        the list of family samples, as new FORMAT fields on every genotype.

        The family members come from the pedigree configured under
        param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"],
        which may be a YAML file path, a JSON string, a comma-separated list of
        sample names, or a dict. If no pedigree is defined, all header samples
        are used.

        Only runs if the VCF has a FORMAT column and at least one sample.

        :param tag: barcode FORMAT tag added to the VCF (a companion "<tag>S"
            tag lists the family samples); falls back to "BCF" if empty,
            defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or empty
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "BCF"

            # VCF infos tags: barcode value and the list of samples it covers
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (YAML)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, then fall back to a
                # comma-separated list of sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: used as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of family sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree: every sample of the header is a family member
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (added column is dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe with the family genotypes
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header
            # (both the barcode tag and its "<tag>S" samples companion)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per genotype column:
            # - family samples get the computed barcode and the sample list
            # - FORMAT gets the new tag names
            # - non-family samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For missing genotypes ('./.'), pad with one '.' per FORMAT
                # sub-field (derived by stripping value characters from FORMAT)
                # before appending the two new fields
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" =
                            concat(
                                CASE
                                    WHEN {table_variants}."{sample}" = './.'
                                    THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                    ELSE {table_variants}."{sample}"
                                END,
                                ':',
                                {value},
                                ':',
                                {value_samples}
                            )
                    """
                )

            # Update all genotype columns in one statement, joining on variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
vcf.parser._Format( 9529 id=tag, 9530 num=".", 9531 type="String", 9532 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9533 type_code=self.code_type_map.get("String"), 9534 ) 9535 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9536 id=f"{tag}S", 9537 num=".", 9538 type="String", 9539 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9540 type_code=self.code_type_map.get("String"), 9541 ) 9542 9543 # Update 9544 # for sample in ped_samples: 9545 sql_update_set = [] 9546 for sample in self.get_header_sample_list() + ["FORMAT"]: 9547 if sample in ped_samples: 9548 value = f'dataframe_barcode."{barcode_infos}"' 9549 value_samples = "'" + ",".join(ped_samples) + "'" 9550 elif sample == "FORMAT": 9551 value = f"'{tag}'" 9552 value_samples = f"'{tag}S'" 9553 else: 9554 value = "'.'" 9555 value_samples = "'.'" 9556 format_regex = r"[a-zA-Z0-9\s]" 9557 sql_update_set.append( 9558 f""" 9559 "{sample}" = 9560 concat( 9561 CASE 9562 WHEN {table_variants}."{sample}" = './.' 9563 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9564 ELSE {table_variants}."{sample}" 9565 END, 9566 ':', 9567 {value}, 9568 ':', 9569 {value_samples} 9570 ) 9571 """ 9572 ) 9573 9574 sql_update_set_join = ", ".join(sql_update_set) 9575 sql_update = f""" 9576 UPDATE {table_variants} 9577 SET {sql_update_set_join} 9578 FROM dataframe_barcode 9579 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9580 """ 9581 self.conn.execute(sql_update) 9582 9583 # Remove added columns 9584 for added_column in added_columns: 9585 self.drop_column(column=added_column) 9586 9587 # Delete dataframe 9588 del dataframe_barcode 9589 gc.collect() 9590 9591 def calculation_trio(self) -> None: 9592 """ 9593 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9594 information to the INFO field of each variant. 
9595 """ 9596 9597 # if FORMAT and samples 9598 if ( 9599 "FORMAT" in self.get_header_columns_as_list() 9600 and self.get_header_sample_list() 9601 ): 9602 9603 # trio annotation field 9604 trio_tag = "trio" 9605 9606 # VCF infos tags 9607 vcf_infos_tags = { 9608 "trio": "trio calculation", 9609 } 9610 9611 # Param 9612 param = self.get_param() 9613 9614 # Prefix 9615 prefix = self.get_explode_infos_prefix() 9616 9617 # Trio param 9618 trio_ped = ( 9619 param.get("calculation", {}) 9620 .get("calculations", {}) 9621 .get("TRIO", {}) 9622 .get("trio_pedigree", None) 9623 ) 9624 9625 # Load trio 9626 if trio_ped: 9627 9628 # Trio pedigree is a file 9629 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9630 log.debug("TRIO pedigree is file") 9631 with open(full_path(trio_ped)) as trio_ped: 9632 trio_ped = yaml.safe_load(trio_ped) 9633 9634 # Trio pedigree is a string 9635 elif isinstance(trio_ped, str): 9636 log.debug("TRIO pedigree is str") 9637 try: 9638 trio_ped = json.loads(trio_ped) 9639 log.debug("TRIO pedigree is json str") 9640 except ValueError as e: 9641 trio_samples = trio_ped.split(",") 9642 if len(trio_samples) == 3: 9643 trio_ped = { 9644 "father": trio_samples[0], 9645 "mother": trio_samples[1], 9646 "child": trio_samples[2], 9647 } 9648 log.debug("TRIO pedigree is list str") 9649 else: 9650 msg_error = "TRIO pedigree not well formatted" 9651 log.error(msg_error) 9652 raise ValueError(msg_error) 9653 9654 # Trio pedigree is a dict 9655 elif isinstance(trio_ped, dict): 9656 log.debug("TRIO pedigree is dict") 9657 9658 # Trio pedigree is not well formatted 9659 else: 9660 msg_error = "TRIO pedigree not well formatted" 9661 log.error(msg_error) 9662 raise ValueError(msg_error) 9663 9664 # Construct trio list 9665 trio_samples = [ 9666 trio_ped.get("father", ""), 9667 trio_ped.get("mother", ""), 9668 trio_ped.get("child", ""), 9669 ] 9670 9671 else: 9672 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9673 samples_list = self.get_header_sample_list() 9674 if len(samples_list) >= 3: 9675 trio_samples = self.get_header_sample_list()[0:3] 9676 trio_ped = { 9677 "father": trio_samples[0], 9678 "mother": trio_samples[1], 9679 "child": trio_samples[2], 9680 } 9681 else: 9682 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9683 log.error(msg_error) 9684 raise ValueError(msg_error) 9685 9686 # Check trio pedigree 9687 if not trio_ped or len(trio_ped) != 3: 9688 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9689 log.error(msg_error) 9690 raise ValueError(msg_error) 9691 9692 # Log 9693 log.info( 9694 f"Calculation 'TRIO' - Samples: " 9695 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9696 ) 9697 9698 # Field 9699 trio_infos = prefix + trio_tag 9700 9701 # Variants table 9702 table_variants = self.get_table_variants() 9703 9704 # Header 9705 vcf_reader = self.get_header() 9706 9707 # Create variant id 9708 variant_id_column = self.get_variant_id_column() 9709 added_columns = [variant_id_column] 9710 9711 # variant_id, FORMAT and samples 9712 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9713 self.get_header_sample_list() 9714 ) 9715 9716 # Create dataframe 9717 dataframe_trio = self.get_query_to_df( 9718 f""" SELECT {samples_fields} FROM {table_variants} """ 9719 ) 9720 9721 # Create trio column 9722 dataframe_trio[trio_infos] = dataframe_trio.apply( 9723 lambda row: trio(row, samples=trio_samples), axis=1 9724 ) 9725 9726 # Add trio to header 9727 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9728 trio_tag, 9729 ".", 9730 "String", 9731 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9732 "howard calculation", 9733 "0", 9734 self.code_type_map.get("String"), 9735 ) 9736 9737 # Update 9738 sql_update = f""" 9739 UPDATE {table_variants} 9740 SET "INFO" = 9741 concat( 9742 CASE 9743 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9744 THEN '' 9745 ELSE 
concat("INFO", ';') 9746 END, 9747 CASE 9748 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9749 AND dataframe_trio."{trio_infos}" NOT NULL 9750 THEN concat( 9751 '{trio_tag}=', 9752 dataframe_trio."{trio_infos}" 9753 ) 9754 ELSE '' 9755 END 9756 ) 9757 FROM dataframe_trio 9758 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9759 """ 9760 self.conn.execute(sql_update) 9761 9762 # Remove added columns 9763 for added_column in added_columns: 9764 self.drop_column(column=added_column) 9765 9766 # Delete dataframe 9767 del dataframe_trio 9768 gc.collect() 9769 9770 def calculation_vaf_normalization(self) -> None: 9771 """ 9772 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9773 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9774 :return: The function does not return anything. 9775 """ 9776 9777 # if FORMAT and samples 9778 if ( 9779 "FORMAT" in self.get_header_columns_as_list() 9780 and self.get_header_sample_list() 9781 ): 9782 9783 # vaf_normalization annotation field 9784 vaf_normalization_tag = "VAF" 9785 9786 # VCF infos tags 9787 vcf_infos_tags = { 9788 "VAF": "VAF Variant Frequency", 9789 } 9790 9791 # Prefix 9792 prefix = self.get_explode_infos_prefix() 9793 9794 # Variants table 9795 table_variants = self.get_table_variants() 9796 9797 # Header 9798 vcf_reader = self.get_header() 9799 9800 # Do not calculate if VAF already exists 9801 if "VAF" in vcf_reader.formats: 9802 log.debug("VAF already on genotypes") 9803 return 9804 9805 # Create variant id 9806 variant_id_column = self.get_variant_id_column() 9807 added_columns = [variant_id_column] 9808 9809 # variant_id, FORMAT and samples 9810 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9811 f""" "{sample}" """ for sample in self.get_header_sample_list() 9812 ) 9813 9814 # Create dataframe 9815 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant statistics of a genotype sub-field across all
        samples (count, list, min, max, mean, median, standard deviation)
        and append each statistic as a "<info>_stats_*" tag in INFO.

        Only runs if the VCF has a FORMAT column and at least one sample.

        :param info: name of the genotype information (FORMAT sub-field) to
            compute the statistics on, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one tag per computed statistic
            # (NB: "mediane" spelling is kept — it is part of the public tag name)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the whole stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (added column is dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with genotypes only
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict row by row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic tag
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to the header
                # (comment fixed: previously said "Add snpeff_hgvs to header")
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no leading separator; later ones are
                # prefixed with ';' so the INFO tags stay separated
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO, joining on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
lambda row: genotype_stats( 9931 row, samples=self.get_header_sample_list(), info=info 9932 ), 9933 axis=1, 9934 ) 9935 9936 # List of vcf tags 9937 sql_vaf_stats_fields = [] 9938 9939 # Check all VAF stats infos 9940 for stat in vcf_infos_tags: 9941 9942 # Extract stats 9943 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9944 lambda x: dict(x).get(stat, "") 9945 ) 9946 9947 # Add snpeff_hgvs to header 9948 vcf_reader.infos[stat] = vcf.parser._Info( 9949 stat, 9950 ".", 9951 "String", 9952 vcf_infos_tags.get(stat, "genotype statistics"), 9953 "howard calculation", 9954 "0", 9955 self.code_type_map.get("String"), 9956 ) 9957 9958 if len(sql_vaf_stats_fields): 9959 sep = ";" 9960 else: 9961 sep = "" 9962 9963 # Create fields to add in INFO 9964 sql_vaf_stats_fields.append( 9965 f""" 9966 CASE 9967 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9968 THEN concat( 9969 '{sep}{stat}=', 9970 dataframe_vaf_stats."{stat}" 9971 ) 9972 ELSE '' 9973 END 9974 """ 9975 ) 9976 9977 # SQL set for update 9978 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9979 9980 # Update 9981 sql_update = f""" 9982 UPDATE {table_variants} 9983 SET "INFO" = 9984 concat( 9985 CASE 9986 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9987 THEN '' 9988 ELSE concat("INFO", ';') 9989 END, 9990 {sql_vaf_stats_fields_set} 9991 ) 9992 FROM dataframe_vaf_stats 9993 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9994 9995 """ 9996 self.conn.execute(sql_update) 9997 9998 # Remove added columns 9999 for added_column in added_columns: 10000 self.drop_column(column=added_column) 10001 10002 # Delete dataframe 10003 del dataframe_vaf_stats 10004 gc.collect() 10005 10006 def calculation_transcripts_annotation( 10007 self, info_json: str = None, info_format: str = None 10008 ) -> None: 10009 """ 10010 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 10011 field to it if transcripts are available. 
10012 10013 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10014 is a string parameter that represents the information field to be used in the transcripts JSON. 10015 It is used to specify the JSON format for the transcripts information. If no value is provided 10016 when calling the method, it defaults to " 10017 :type info_json: str 10018 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10019 method is a string parameter that specifies the format of the information field to be used in 10020 the transcripts JSON. It is used to define the format of the information field 10021 :type info_format: str 10022 """ 10023 10024 # Create transcripts table 10025 transcripts_table = self.create_transcript_view() 10026 10027 # Add info field 10028 if transcripts_table: 10029 self.transcript_view_to_variants( 10030 transcripts_table=transcripts_table, 10031 transcripts_info_field_json=info_json, 10032 transcripts_info_field_format=info_format, 10033 ) 10034 else: 10035 log.info("No Transcripts to process. Check param.json file configuration") 10036 10037 def calculation_transcripts_prioritization(self) -> None: 10038 """ 10039 The function `calculation_transcripts_prioritization` creates a transcripts table and 10040 prioritizes transcripts based on certain criteria. 10041 """ 10042 10043 # Create transcripts table 10044 transcripts_table = self.create_transcript_view() 10045 10046 # Add info field 10047 if transcripts_table: 10048 self.transcripts_prioritization(transcripts_table=transcripts_table) 10049 else: 10050 log.info("No Transcripts to process. 
Check param.json file configuration") 10051 10052 def calculation_transcripts_export(self) -> None: 10053 """ """ 10054 10055 # Create transcripts table 10056 transcripts_table = self.create_transcript_view() 10057 10058 # Add info field 10059 if transcripts_table: 10060 self.transcripts_export(transcripts_table=transcripts_table) 10061 else: 10062 log.info("No Transcripts to process. Check param.json file configuration") 10063 10064 ############### 10065 # Transcripts # 10066 ############### 10067 10068 def transcripts_export( 10069 self, transcripts_table: str = None, param: dict = {} 10070 ) -> bool: 10071 """ """ 10072 10073 log.debug("Start transcripts export...") 10074 10075 # Param 10076 if not param: 10077 param = self.get_param() 10078 10079 # Param export 10080 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10081 10082 # Output file 10083 transcripts_export_output = param_transcript_export.get("output", None) 10084 10085 if not param_transcript_export or not transcripts_export_output: 10086 log.warning(f"No transcriipts export parameters defined!") 10087 return False 10088 10089 # List of transcripts annotations 10090 query_describe = f""" 10091 SELECT column_name 10092 FROM ( 10093 DESCRIBE SELECT * FROM {transcripts_table} 10094 ) 10095 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10096 """ 10097 transcripts_annotations_list = list( 10098 self.get_query_to_df(query=query_describe)["column_name"] 10099 ) 10100 10101 # Create transcripts table for export 10102 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10103 random.choices(string.ascii_uppercase + string.digits, k=10) 10104 ) 10105 query_create_transcripts_table_export = f""" 10106 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10107 """ 10108 self.execute_query(query=query_create_transcripts_table_export) 10109 10110 # 
Output file format 10111 transcripts_export_output_format = get_file_format( 10112 filename=transcripts_export_output 10113 ) 10114 10115 # Format VCF - construct INFO 10116 if transcripts_export_output_format in ["vcf"]: 10117 10118 # Construct query update INFO and header 10119 query_update_info = [] 10120 for field in transcripts_annotations_list: 10121 10122 # If field not in header 10123 if field not in self.get_header_infos_list(): 10124 10125 # Add PZ Transcript in header 10126 self.get_header().infos[field] = vcf.parser._Info( 10127 field, 10128 ".", 10129 "String", 10130 f"Annotation '{field}' from transcript view", 10131 "unknown", 10132 "unknown", 10133 0, 10134 ) 10135 10136 # Add field as INFO/tag 10137 query_update_info.append( 10138 f""" 10139 CASE 10140 WHEN "{field}" IS NOT NULL 10141 THEN concat('{field}=', "{field}", ';') 10142 ELSE '' 10143 END 10144 """ 10145 ) 10146 10147 # Query param 10148 query_update_info_value = ( 10149 f""" concat('', {", ".join(query_update_info)}) """ 10150 ) 10151 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' 
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Prioritize transcripts and write the winning transcript's annotations
        back into the variants table INFO column.

        Builds (or reuses) the transcripts table, runs the generic
        `self.prioritization` process on it, ranks transcripts per variant
        (optionally honoring a transcript-preference file), and for the
        top-ranked transcript of each variant appends
        ``<prefix>Transcript=<id>`` plus the configured PZ fields to INFO.

        :param transcripts_table: name of the table containing transcripts data;
            if None, it is created via `create_transcript_view` (default table
            name "transcripts")
        :param param: configuration dict; if falsy, `self.get_param()` is used.
            Relevant keys live under ``param["transcripts"]["prioritization"]``
            (``pzprefix``, ``pzfields``, ``profiles``,
            ``prioritization_transcripts``/``_force``/``_version_force``,
            ``prioritization_transcripts_order``)
        :return: True on success, False if no profile is configured or the
            underlying prioritization step did not run
        :raises ValueError: if the transcripts table cannot be created, or a
            field to explode is neither in the header nor in the table
        """

        log.debug("Start transcripts prioritization...")

        # Param: fall back to the object's stored parameters
        if not param:
            param = self.get_param()

        # Variants table (target of the final UPDATE)
        table_variants = self.get_table_variants()

        # Transcripts table: create it on demand when not provided
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                # NOTE(review): typo "availalble" in this error message
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts table columns (DuckDB DESCRIBE)
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if not exists (prioritization writes into it)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param
        # NOTE(review): pz_param aliases param["transcripts"]["prioritization"]
        # (same dict object) — the "pzfields" assignment below mutates param too
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit early if no profile is configured
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields mapping: {source column in transcripts table: INFO tag name}
        pz_param_pzfields = {}

        # INFO tag carrying the selected transcript id (e.g. "PTZTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Declare the selected-transcript tag in the VCF header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory PZ fields produced by the prioritization process
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # User-requested PZ fields: mandatory ones keep their prefixed name,
        # other annotations are exported under a prefixed alias
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Declare the prefixed annotation tag in the VCF header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # Force the prioritization step to compute only the mandatory fields
        # (mutates the shared param dict — see NOTE above)
        pz_param["pzfields"] = pz_mandatory_fields

        # Run generic prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # Build SQL fragments: SELECT columns, INFO concat parts, ORDER BY parts
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Ranking order; defaults to Flag DESC then Score DESC
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields that must exist as real columns (exploded from INFO)
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # "transcript" is already a dedicated column of the transcripts table
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Each field to explode must be declared in the header or already a column
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode the INFO tags into columns of the transcripts table
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file (ranked list of preferred transcripts)
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Preference beats scores when forced
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Match transcript ids with (True) or without (False) their version suffix
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts ranking query
        if transcripts_preference_file:

            # Load preference file into a dataframe (queried by name in DuckDB)
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Preference order first or last depending on forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Join condition: exact id, or version-stripped id (split_part on '.')
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Rank transcripts per variant, preference-aware
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Rank transcripts per variant on PZ fields only
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Append the top-ranked (rn = 1) transcript's tags to the variants INFO
        # NOTE(review): WHERE clause hard-codes the alias "variants" rather than
        # {table_variants} — confirm table_variants is always "variants"
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        columns_maps: dict = {},
        added_columns: list = [],
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list, list]:
        """
        Build one temporary transcripts table per columns-map entry.

        Each entry of `columns_maps` names a comma-separated transcript id
        column (``transcripts_column``) and companion annotation columns
        (``transcripts_infos_columns``); these are exploded from INFO into
        real columns, then unnested row-per-transcript into a randomly-named
        temporary table via `regexp_split_to_table`.

        Example entry::

            {
                "transcripts_column": "Ensembl_transcriptid",
                "transcripts_infos_columns": ["genename", "LIST_S2_score"],
            }

        :param transcripts_table: prefix used for the temporary table names,
            defaults to "transcripts"
        :param columns_maps: iterable of mapping dicts as described above
        :param added_columns: accumulator list of columns added by
            `explode_infos` (mutated in place and returned)
        :param temporary_tables: accumulator list of created temporary table
            names (a fresh list if None)
        :param annotation_fields: accumulator list of output annotation field
            names (a fresh list if None)
        :param column_rename: default mapping {original name: output name};
            each entry may override it via its own "column_rename" key
        :param column_clean: default flag to clean output names with
            `clean_annotation_field`; overridable per entry
        :param column_case: default case transform ("lower"/"upper" or None);
            overridable per entry
        :return: tuple (added_columns, temporary_tables, annotation_fields)
        """

        log.debug("Start transcrpts view creation from columns map...")

        # Init accumulators (avoid mutable default arguments)
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        # Variants table (source of the exploded columns)
        table_variants = self.get_table_variants()

        for columns_map in columns_maps:

            # Transcript column (comma-separated transcript ids)
            transcripts_column = columns_map.get("transcripts_column", None)

            # Companion annotation columns (comma-separated, same order)
            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

            # NOTE(review): the three rebinds below make per-entry overrides
            # carry over to subsequent entries that omit the key — confirm
            # this carry-over is intended
            column_rename = columns_map.get("column_rename", column_rename)

            # Per-entry clean flag override
            column_clean = columns_map.get("column_clean", column_clean)

            # Per-entry case override
            column_case = columns_map.get("column_case", column_case)

            if transcripts_column is not None:

                # Explode INFO tags into real columns of the variants table
                added_columns += self.explode_infos(
                    fields=[transcripts_column] + transcripts_infos_columns
                )

                # Build SELECT clauses: inner (raw names) and outer (renamed)
                clause_select_variants = []
                clause_select_tanscripts = []
                for field in [transcripts_column] + transcripts_infos_columns:

                    # Output name, possibly transformed below
                    as_field = field

                    # Rename
                    if column_rename:
                        as_field = column_rename.get(as_field, as_field)

                    # Clean (strip non-conforming characters)
                    if column_clean:
                        as_field = clean_annotation_field(as_field)

                    # Case transform
                    if column_case:
                        if column_case.lower() in ["lower"]:
                            as_field = as_field.lower()
                        elif column_case.lower() in ["upper"]:
                            as_field = as_field.upper()

                    # Inner clause: split comma lists into one row per value
                    clause_select_variants.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )

                    # Outer clause: transcript column keeps its name, the
                    # others are exported under their transformed name
                    if field in [transcripts_column]:
                        clause_select_tanscripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                        )
                    else:
                        clause_select_tanscripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
                        )
                        annotation_fields.append(as_field)

                # Query creating one row per (variant, transcript)
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select_tanscripts)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT, INFO,
                            {", ".join(clause_select_variants)}
                        FROM {table_variants}
                        )
                    WHERE "{transcripts_column}" IS NOT NULL
                """

                # Random suffix avoids collisions between temporary tables
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # Record and create the temporary table
                temporary_tables.append(temporary_table)
                query_view = f"""
                    CREATE TEMPORARY TABLE {temporary_table}
                    AS ({query})
                """
                self.execute_query(query=query_view)

        return added_columns, temporary_tables, annotation_fields
columns and annotation fields, and returns the list of 10689 temporary tables and annotation fields. 10690 10691 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10692 of the table containing the transcripts data. This table will be used as the base table for 10693 creating the transcript view. The default value for this parameter is "transcripts", but you can 10694 provide a different table name if needed, defaults to transcripts 10695 :type transcripts_table: str (optional) 10696 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10697 about the columns to be used for creating the transcript view. Each entry in the dictionary 10698 specifies the mapping between a transcripts column and a transcripts infos column. This 10699 parameter allows you to define how the columns from the transcripts table should be transformed 10700 or mapped 10701 :type column_formats: dict 10702 :param temporary_tables: The `temporary_tables` parameter in the 10703 `create_transcript_view_from_column_format` function is a list that stores the names of 10704 temporary views created during the process of creating a transcript view from a column format. 10705 These temporary views are used to manipulate and extract data before generating the final 10706 transcript view 10707 :type temporary_tables: list 10708 :param annotation_fields: The `annotation_fields` parameter in the 10709 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10710 that are extracted from the temporary views created during the process. 
These annotation fields 10711 are obtained by querying the temporary views and extracting the column names excluding specific 10712 columns like `#CH 10713 :type annotation_fields: list 10714 :param column_rename: The `column_rename` parameter in the 10715 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10716 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10717 column names to new column names in this dictionary, you can rename specific columns during the 10718 process 10719 :type column_rename: dict 10720 :param column_clean: The `column_clean` parameter in the 10721 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10722 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10723 will be cleaned during the creation of the transcript view based on the specified column format, 10724 defaults to False 10725 :type column_clean: bool (optional) 10726 :param column_case: The `column_case` parameter in the 10727 `create_transcript_view_from_column_format` function is used to specify the case transformation 10728 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10729 to convert the column names to uppercase or lowercase, respectively 10730 :type column_case: str 10731 :return: The `create_transcript_view_from_column_format` function returns two lists: 10732 `temporary_tables` and `annotation_fields`. 
10733 """ 10734 10735 log.debug("Start transcrpts view creation from column format...") 10736 10737 # "from_column_format": [ 10738 # { 10739 # "transcripts_column": "ANN", 10740 # "transcripts_infos_column": "Feature_ID", 10741 # } 10742 # ], 10743 10744 # Init 10745 if temporary_tables is None: 10746 temporary_tables = [] 10747 if annotation_fields is None: 10748 annotation_fields = [] 10749 10750 for column_format in column_formats: 10751 10752 # annotation field and transcript annotation field 10753 annotation_field = column_format.get("transcripts_column", "ANN") 10754 transcript_annotation = column_format.get( 10755 "transcripts_infos_column", "Feature_ID" 10756 ) 10757 10758 # Transcripts infos columns rename 10759 column_rename = column_format.get("column_rename", column_rename) 10760 10761 # Transcripts infos columns clean 10762 column_clean = column_format.get("column_clean", column_clean) 10763 10764 # Transcripts infos columns case 10765 column_case = column_format.get("column_case", column_case) 10766 10767 # Temporary View name 10768 temporary_view_name = transcripts_table + "".join( 10769 random.choices(string.ascii_uppercase + string.digits, k=10) 10770 ) 10771 10772 # Create temporary view name 10773 temporary_view_name = self.annotation_format_to_table( 10774 uniquify=True, 10775 annotation_field=annotation_field, 10776 view_name=temporary_view_name, 10777 annotation_id=transcript_annotation, 10778 column_rename=column_rename, 10779 column_clean=column_clean, 10780 column_case=column_case, 10781 ) 10782 10783 # Annotation fields 10784 if temporary_view_name: 10785 query_annotation_fields = f""" 10786 SELECT * 10787 FROM ( 10788 DESCRIBE SELECT * 10789 FROM {temporary_view_name} 10790 ) 10791 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10792 """ 10793 df_annotation_fields = self.get_query_to_df( 10794 query=query_annotation_fields 10795 ) 10796 10797 # Add temporary view and annotation fields 10798 
temporary_tables.append(temporary_view_name) 10799 annotation_fields += list(set(df_annotation_fields["column_name"])) 10800 10801 return temporary_tables, annotation_fields 10802 10803 def create_transcript_view( 10804 self, 10805 transcripts_table: str = None, 10806 transcripts_table_drop: bool = False, 10807 param: dict = {}, 10808 ) -> str: 10809 """ 10810 The `create_transcript_view` function generates a transcript view by processing data from a 10811 specified table based on provided parameters and structural information. 10812 10813 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10814 is used to specify the name of the table that will store the final transcript view data. If a table 10815 name is not provided, the function will create a new table to store the transcript view data, and by 10816 default,, defaults to transcripts 10817 :type transcripts_table: str (optional) 10818 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10819 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10820 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10821 the function will drop the existing transcripts table if it exists, defaults to False 10822 :type transcripts_table_drop: bool (optional) 10823 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10824 contains information needed to create a transcript view. It includes details such as the structure 10825 of the transcripts, columns mapping, column formats, and other necessary information for generating 10826 the view. This parameter allows for flexibility and customization 10827 :type param: dict 10828 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10829 created or modified during the execution of the function. 
10830 """ 10831 10832 log.debug("Start transcripts view creation...") 10833 10834 # Default 10835 transcripts_table_default = "transcripts" 10836 10837 # Param 10838 if not param: 10839 param = self.get_param() 10840 10841 # Struct 10842 struct = param.get("transcripts", {}).get("struct", None) 10843 10844 # Transcript veresion 10845 transcript_id_remove_version = param.get("transcripts", {}).get( 10846 "transcript_id_remove_version", False 10847 ) 10848 10849 # Transcripts mapping 10850 transcript_id_mapping_file = param.get("transcripts", {}).get( 10851 "transcript_id_mapping_file", None 10852 ) 10853 10854 # Transcripts mapping 10855 transcript_id_mapping_force = param.get("transcripts", {}).get( 10856 "transcript_id_mapping_force", None 10857 ) 10858 10859 if struct: 10860 10861 # Transcripts table 10862 if transcripts_table is None: 10863 transcripts_table = param.get("transcripts", {}).get( 10864 "table", transcripts_table_default 10865 ) 10866 10867 # added_columns 10868 added_columns = [] 10869 10870 # Temporary tables 10871 temporary_tables = [] 10872 10873 # Annotation fields 10874 annotation_fields = [] 10875 10876 # from columns map 10877 columns_maps = struct.get("from_columns_map", []) 10878 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10879 self.create_transcript_view_from_columns_map( 10880 transcripts_table=transcripts_table, 10881 columns_maps=columns_maps, 10882 added_columns=added_columns, 10883 temporary_tables=temporary_tables, 10884 annotation_fields=annotation_fields, 10885 ) 10886 ) 10887 added_columns += added_columns_tmp 10888 temporary_tables += temporary_tables_tmp 10889 annotation_fields += annotation_fields_tmp 10890 10891 # from column format 10892 column_formats = struct.get("from_column_format", []) 10893 temporary_tables_tmp, annotation_fields_tmp = ( 10894 self.create_transcript_view_from_column_format( 10895 transcripts_table=transcripts_table, 10896 column_formats=column_formats, 10897 
temporary_tables=temporary_tables, 10898 annotation_fields=annotation_fields, 10899 ) 10900 ) 10901 temporary_tables += temporary_tables_tmp 10902 annotation_fields += annotation_fields_tmp 10903 10904 # Remove some specific fields/column 10905 annotation_fields = list(set(annotation_fields)) 10906 for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: 10907 if field in annotation_fields: 10908 annotation_fields.remove(field) 10909 10910 # Merge temporary tables query 10911 query_merge = "" 10912 for temporary_table in list(set(temporary_tables)): 10913 10914 # First temporary table 10915 if not query_merge: 10916 query_merge = f""" 10917 SELECT * FROM {temporary_table} 10918 """ 10919 # other temporary table (using UNION) 10920 else: 10921 query_merge += f""" 10922 UNION BY NAME SELECT * FROM {temporary_table} 10923 """ 10924 10925 # transcript table tmp 10926 transcript_table_tmp = "transcripts_tmp" 10927 transcript_table_tmp2 = "transcripts_tmp2" 10928 transcript_table_tmp3 = "transcripts_tmp3" 10929 10930 # Merge on transcript 10931 query_merge_on_transcripts_annotation_fields = [] 10932 10933 # Add transcript list 10934 query_merge_on_transcripts_annotation_fields.append( 10935 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """ 10936 ) 10937 10938 # Aggregate all annotations fields 10939 for annotation_field in set(annotation_fields): 10940 query_merge_on_transcripts_annotation_fields.append( 10941 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """ 10942 ) 10943 10944 # Transcripts mapping 10945 if transcript_id_mapping_file: 10946 10947 # Transcript dataframe 10948 transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe" 10949 transcript_id_mapping_dataframe = transcripts_file_to_df( 10950 transcript_id_mapping_file, column_names=["transcript", "alias"] 10951 ) 10952 10953 # 
Transcript version remove 10954 if transcript_id_remove_version: 10955 query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped" 10956 query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)" 10957 query_left_join = f""" 10958 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 10959 """ 10960 else: 10961 query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped" 10962 query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript" 10963 query_left_join = f""" 10964 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 10965 """ 10966 10967 # Transcript column for group by merge 10968 query_transcript_merge_group_by = """ 10969 CASE 10970 WHEN transcript_mapped NOT IN ('') 10971 THEN split_part(transcript_mapped, '.', 1) 10972 ELSE split_part(transcript_original, '.', 1) 10973 END 10974 """ 10975 10976 # Merge query 10977 transcripts_tmp2_query = f""" 10978 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)} 10979 FROM ({query_merge}) AS {transcript_table_tmp} 10980 {query_left_join} 10981 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by} 10982 """ 10983 10984 # Retrive columns after mege 10985 transcripts_tmp2_describe_query = f""" 10986 DESCRIBE {transcripts_tmp2_query} 10987 """ 10988 transcripts_tmp2_describe_list 
= list( 10989 self.get_query_to_df(query=transcripts_tmp2_describe_query)[ 10990 "column_name" 10991 ] 10992 ) 10993 10994 # Create list of columns for select clause 10995 transcripts_tmp2_describe_select_clause = [] 10996 for field in transcripts_tmp2_describe_list: 10997 if field not in [ 10998 "#CHROM", 10999 "POS", 11000 "REF", 11001 "ALT", 11002 "INFO", 11003 "transcript_mapped", 11004 ]: 11005 as_field = field 11006 if field in ["transcript_original"]: 11007 as_field = "transcripts_mapped" 11008 transcripts_tmp2_describe_select_clause.append( 11009 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """ 11010 ) 11011 11012 # Merge with mapping 11013 query_merge_on_transcripts = f""" 11014 SELECT 11015 "#CHROM", POS, REF, ALT, INFO, 11016 CASE 11017 WHEN ANY_VALUE(transcript_mapped) NOT IN ('') 11018 THEN ANY_VALUE(transcript_mapped) 11019 ELSE ANY_VALUE(transcript_original) 11020 END AS transcript, 11021 {", ".join(transcripts_tmp2_describe_select_clause)} 11022 FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2} 11023 GROUP BY "#CHROM", POS, REF, ALT, INFO, 11024 {query_transcript_merge_group_by} 11025 """ 11026 11027 # Add transcript filter from mapping file 11028 if transcript_id_mapping_force: 11029 query_merge_on_transcripts = f""" 11030 SELECT * 11031 FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3} 11032 WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe) 11033 """ 11034 11035 # No transcript mapping 11036 else: 11037 11038 # Remove transcript version 11039 if transcript_id_remove_version: 11040 query_transcript_column = f""" 11041 split_part({transcript_table_tmp}.transcript, '.', 1) 11042 """ 11043 else: 11044 query_transcript_column = """ 11045 transcript 11046 """ 11047 11048 # Query sections 11049 query_transcript_column_select = ( 11050 f"{query_transcript_column} AS transcript" 
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Convert a structured annotation INFO field (e.g. snpEff 'ANN') into a
        temporary table with one column per annotation sub-field.

        Sub-field names are parsed from the quoted part of the VCF header
        description of `annotation_field`, each annotation entry is exploded
        to a JSON document, a SQL type is auto-detected per sub-field, and a
        temporary table `view_name` is created with one row per annotation
        entry plus a 'transcript' column taken from `annotation_id`.

        :param uniquify: deduplicate annotation values when exploding,
            defaults to True
        :param annotation_field: INFO field holding the annotations,
            defaults to "ANN"
        :param annotation_id: sub-field used as the 'transcript' identifier,
            defaults to "Feature_ID"
        :param view_name: name of the temporary table to create,
            defaults to "transcripts"
        :param column_rename: optional mapping to rename sub-field columns
        :param column_clean: clean sub-field names (clean_annotation_field)
            if True
        :param column_case: force sub-field names to "lower" or "upper" case
        :raises ValueError: if the header description of `annotation_field`
            carries no parsable quoted sub-field list
        :return: the created table name, or None if `annotation_field` is not
            declared in the VCF header
        """

        # Name of the intermediate JSON column built from the annotation field
        annotation_format = "annotation_explode"

        # Apply renaming/cleaning to the transcript identifier so it matches
        # the cleaned column names generated below
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix
        # NOTE(review): any non-empty configured prefix is replaced by
        # "INFO/" here — confirm this override is intentional
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the annotation field and its JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added along the way, dropped again before returning
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the quoted part of the header
            # description (e.g. "Functional annotations: 'A | B | C'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only name, mapped back to the original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Unique variant id column (added, then dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Materialize the variants and their annotation column
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation value into a JSON document keyed by the
            # header sub-field labels
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the JSON keys actually present (from the first entry '$.0')
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]
                key_clean = key

                # key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract all values of this key, to detect the column type
                query_json_type = f"""SELECT
                unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize empty/None values to NaN and drop them, so type
                # detection only considers real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Cast empty strings to NULL and the rest to the detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, one row per annotation entry
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing to build
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 11244 11245 # Get DataFrame from query 11246 df_json_type = self.get_query_to_df(query=query_json_type) 11247 11248 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 11249 with pd.option_context("future.no_silent_downcasting", True): 11250 df_json_type.fillna(value="", inplace=True) 11251 replace_dict = {None: np.nan, "": np.nan} 11252 df_json_type.replace(replace_dict, inplace=True) 11253 df_json_type.dropna(inplace=True) 11254 11255 # Detect column type 11256 column_type = detect_column_type(df_json_type[key_clean]) 11257 11258 # Append 11259 query_json_key.append( 11260 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 11261 ) 11262 11263 # Create view 11264 query_view = f""" 11265 CREATE TEMPORARY TABLE {view_name} 11266 AS ( 11267 SELECT *, {annotation_id} AS 'transcript' 11268 FROM ( 11269 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11270 FROM dataframe_annotation_format 11271 ) 11272 ); 11273 """ 11274 self.execute_query(query=query_view) 11275 11276 else: 11277 11278 # Return None 11279 view_name = None 11280 11281 # Remove added columns 11282 for added_column in added_columns: 11283 self.drop_column(column=added_column) 11284 11285 return view_name 11286 11287 def transcript_view_to_variants( 11288 self, 11289 transcripts_table: str = None, 11290 transcripts_column_id: str = None, 11291 transcripts_info_json: str = None, 11292 transcripts_info_field_json: str = None, 11293 transcripts_info_format: str = None, 11294 transcripts_info_field_format: str = None, 11295 param: dict = {}, 11296 ) -> bool: 11297 """ 11298 The `transcript_view_to_variants` function updates a variants table with information from 11299 transcripts in JSON format. 
11300 11301 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11302 table containing the transcripts data. If this parameter is not provided, the function will 11303 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11304 :type transcripts_table: str 11305 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11306 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11307 identifier is used to match transcripts with variants in the database 11308 :type transcripts_column_id: str 11309 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11310 of the column in the variants table where the transcripts information will be stored in JSON 11311 format. This parameter allows you to define the column in the variants table that will hold the 11312 JSON-formatted information about transcripts 11313 :type transcripts_info_json: str 11314 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11315 specify the field in the VCF header that will contain information about transcripts in JSON 11316 format. This field will be added to the VCF header as an INFO field with the specified name 11317 :type transcripts_info_field_json: str 11318 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11319 format of the information about transcripts that will be stored in the variants table. This 11320 format can be used to define how the transcript information will be structured or displayed 11321 within the variants table 11322 :type transcripts_info_format: str 11323 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11324 specify the field in the VCF header that will contain information about transcripts in a 11325 specific format. 
This field will be added to the VCF header as an INFO field with the specified 11326 name 11327 :type transcripts_info_field_format: str 11328 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11329 that contains various configuration settings related to transcripts. It is used to provide 11330 default values for certain parameters if they are not explicitly provided when calling the 11331 method. The `param` dictionary can be passed as an argument 11332 :type param: dict 11333 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11334 if the operation is successful and `False` if certain conditions are not met. 11335 """ 11336 11337 msg_info_prefix = "Start transcripts view to variants annotations" 11338 11339 log.debug(f"{msg_info_prefix}...") 11340 11341 # Default 11342 transcripts_table_default = "transcripts" 11343 transcripts_column_id_default = "transcript" 11344 transcripts_info_json_default = None 11345 transcripts_info_format_default = None 11346 transcripts_info_field_json_default = None 11347 transcripts_info_field_format_default = None 11348 11349 # Param 11350 if not param: 11351 param = self.get_param() 11352 11353 # Transcripts table 11354 if transcripts_table is None: 11355 transcripts_table = param.get("transcripts", {}).get( 11356 "table", transcripts_table_default 11357 ) 11358 11359 # Transcripts column ID 11360 if transcripts_column_id is None: 11361 transcripts_column_id = param.get("transcripts", {}).get( 11362 "column_id", transcripts_column_id_default 11363 ) 11364 11365 # Transcripts info json 11366 if transcripts_info_json is None: 11367 transcripts_info_json = param.get("transcripts", {}).get( 11368 "transcripts_info_json", transcripts_info_json_default 11369 ) 11370 11371 # Transcripts info field JSON 11372 if transcripts_info_field_json is None: 11373 transcripts_info_field_json = param.get("transcripts", {}).get( 11374 
"transcripts_info_field_json", transcripts_info_field_json_default 11375 ) 11376 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11377 # transcripts_info_json = transcripts_info_field_json 11378 11379 # Transcripts info format 11380 if transcripts_info_format is None: 11381 transcripts_info_format = param.get("transcripts", {}).get( 11382 "transcripts_info_format", transcripts_info_format_default 11383 ) 11384 11385 # Transcripts info field FORMAT 11386 if transcripts_info_field_format is None: 11387 transcripts_info_field_format = param.get("transcripts", {}).get( 11388 "transcripts_info_field_format", transcripts_info_field_format_default 11389 ) 11390 # if ( 11391 # transcripts_info_field_format is not None 11392 # and transcripts_info_format is None 11393 # ): 11394 # transcripts_info_format = transcripts_info_field_format 11395 11396 # Variants table 11397 table_variants = self.get_table_variants() 11398 11399 # Check info columns param 11400 if ( 11401 transcripts_info_json is None 11402 and transcripts_info_field_json is None 11403 and transcripts_info_format is None 11404 and transcripts_info_field_format is None 11405 ): 11406 return False 11407 11408 # Transcripts infos columns 11409 query_transcripts_infos_columns = f""" 11410 SELECT * 11411 FROM ( 11412 DESCRIBE SELECT * FROM {transcripts_table} 11413 ) 11414 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11415 """ 11416 transcripts_infos_columns = list( 11417 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11418 ) 11419 11420 # View results 11421 clause_select = [] 11422 clause_to_json = [] 11423 clause_to_format = [] 11424 for field in transcripts_infos_columns: 11425 # Do not consider INFO field for export into fields 11426 if field not in ["INFO"]: 11427 clause_select.append( 11428 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11429 ) 11430 clause_to_json.append(f""" 
'{field}': "{field}" """) 11431 clause_to_format.append(f""" "{field}" """) 11432 11433 # Update 11434 update_set_json = [] 11435 update_set_format = [] 11436 11437 # VCF header 11438 vcf_reader = self.get_header() 11439 11440 # Transcripts to info column in JSON 11441 if transcripts_info_json: 11442 11443 # Create column on variants table 11444 self.add_column( 11445 table_name=table_variants, 11446 column_name=transcripts_info_json, 11447 column_type="JSON", 11448 default_value=None, 11449 drop=False, 11450 ) 11451 11452 # Add header 11453 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11454 transcripts_info_json, 11455 ".", 11456 "String", 11457 "Transcripts in JSON format", 11458 "unknwon", 11459 "unknwon", 11460 self.code_type_map["String"], 11461 ) 11462 11463 # Add to update 11464 update_set_json.append( 11465 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11466 ) 11467 11468 # Transcripts to info field in JSON 11469 if transcripts_info_field_json: 11470 11471 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11472 11473 # Add to update 11474 update_set_json.append( 11475 f""" 11476 INFO = concat( 11477 CASE 11478 WHEN INFO NOT IN ('', '.') 11479 THEN INFO 11480 ELSE '' 11481 END, 11482 CASE 11483 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11484 THEN concat( 11485 ';{transcripts_info_field_json}=', 11486 t.{transcripts_info_json} 11487 ) 11488 ELSE '' 11489 END 11490 ) 11491 """ 11492 ) 11493 11494 # Add header 11495 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11496 transcripts_info_field_json, 11497 ".", 11498 "String", 11499 "Transcripts in JSON format", 11500 "unknwon", 11501 "unknwon", 11502 self.code_type_map["String"], 11503 ) 11504 11505 if update_set_json: 11506 11507 # Update query 11508 query_update = f""" 11509 UPDATE {table_variants} 11510 SET {", ".join(update_set_json)} 11511 FROM 11512 ( 11513 SELECT 11514 "#CHROM", POS, REF, ALT, 11515 concat( 11516 '{{', 11517 
string_agg( 11518 '"' || "{transcripts_column_id}" || '":' || 11519 to_json(json_output) 11520 ), 11521 '}}' 11522 )::JSON AS {transcripts_info_json} 11523 FROM 11524 ( 11525 SELECT 11526 "#CHROM", POS, REF, ALT, 11527 "{transcripts_column_id}", 11528 to_json( 11529 {{{",".join(clause_to_json)}}} 11530 )::JSON AS json_output 11531 FROM 11532 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11533 WHERE "{transcripts_column_id}" IS NOT NULL 11534 ) 11535 GROUP BY "#CHROM", POS, REF, ALT 11536 ) AS t 11537 WHERE {table_variants}."#CHROM" = t."#CHROM" 11538 AND {table_variants}."POS" = t."POS" 11539 AND {table_variants}."REF" = t."REF" 11540 AND {table_variants}."ALT" = t."ALT" 11541 """ 11542 11543 self.execute_query(query=query_update) 11544 11545 # Transcripts to info column in FORMAT 11546 if transcripts_info_format: 11547 11548 # Create column on variants table 11549 self.add_column( 11550 table_name=table_variants, 11551 column_name=transcripts_info_format, 11552 column_type="VARCHAR", 11553 default_value=None, 11554 drop=False, 11555 ) 11556 11557 # Add header 11558 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11559 transcripts_info_format, 11560 ".", 11561 "String", 11562 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11563 "unknwon", 11564 "unknwon", 11565 self.code_type_map["String"], 11566 ) 11567 11568 # Add to update 11569 update_set_format.append( 11570 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11571 ) 11572 11573 else: 11574 11575 # Set variable for internal queries 11576 transcripts_info_format = "transcripts_info_format" 11577 11578 # Transcripts to info field in JSON 11579 if transcripts_info_field_format: 11580 11581 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11582 11583 # Add to update 11584 update_set_format.append( 11585 f""" 11586 INFO = concat( 11587 CASE 11588 WHEN INFO NOT IN ('', 
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        Rename or remove INFO fields both in the VCF header and in the INFO
        column of the variants table.

        Each key of `fields_to_rename` is an existing INFO field; its value is
        the new name, or None to remove the field entirely. The INFO column is
        rewritten with chained SQL regexp_replace calls, batched in groups of
        125 fields per UPDATE to keep each query a manageable size.

        :param fields_to_rename: mapping of original field name to new name
            (None value removes the field)
        :param table: variants table to update; defaults to
            self.get_table_variants()
        :return: mapping of processed original names to their new name (None
            for removed fields); empty when nothing was done (e.g. read-only
            access)
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        if table is None:
            table = self.get_table_variants()

        # State for building chained regexp_replace expressions, partitioned
        # so each UPDATE nests at most `regex_replace_partition` replacements
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "INFO"

        # Skip entirely when nothing to rename or database is read-only
        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                # Only fields actually declared in the header are processed
                if field_to_rename in header.infos:

                    # Rename header: copy the declaration under the new name,
                    # then drop the old entry (removal only drops it)
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Rename INFO patterns: match 'FIELD=value' at start or
                    # after ';'; empty replacement removes the annotation
                    field_pattern = rf'(^|;)({field_to_rename})=([^;]*)'
                    if field_renamed is not None:
                        field_renamed_pattern = rf'\1{field_renamed}=\3'
                    else:
                        field_renamed_pattern = ''

                    # Chain this replacement onto the current partition's
                    # expression; a new partition restarts from plain INFO
                    # (partitions are applied by sequential UPDATEs below)
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "INFO"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
                    else:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")

            # Rename INFO: one UPDATE per partition, applied sequentially so
            # later partitions operate on the already-updated INFO column
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = {regex_replace}
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
fields - field '{field_to_rename}' removed") 11735 11736 # Rename INFO 11737 for regex_replace_key, regex_replace in regex_replace_dict.items(): 11738 log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...") 11739 query = f""" 11740 UPDATE {table} 11741 SET 11742 INFO = {regex_replace} 11743 """ 11744 log.debug(f"query={query}") 11745 self.execute_query(query=query) 11746 11747 return fields_renamed 11748 11749 def calculation_rename_info_fields( 11750 self, 11751 fields_to_rename: dict = None, 11752 table: str = None, 11753 operation_name: str = "RENAME_INFO_FIELDS", 11754 ) -> None: 11755 """ 11756 The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates 11757 fields to rename and table if provided, and then calls another function to rename the fields. 11758 11759 :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be 11760 renamed in a table. Each key-value pair in the dictionary represents the original field name as 11761 the key and the new field name as the value 11762 :type fields_to_rename: dict 11763 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11764 specify the name of the table for which the fields are to be renamed. It is a string type 11765 parameter 11766 :type table: str 11767 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11768 method is a string that specifies the name of the operation being performed. 
In this context, it 11769 is used as a default value for the operation name if not explicitly provided when calling the 11770 function, defaults to RENAME_INFO_FIELDS 11771 :type operation_name: str (optional) 11772 """ 11773 11774 # Param 11775 param = self.get_param() 11776 11777 # Get param fields to rename 11778 param_fields_to_rename = ( 11779 param.get("calculation", {}) 11780 .get("calculations", {}) 11781 .get(operation_name, {}) 11782 .get("fields_to_rename", None) 11783 ) 11784 11785 # Get param table 11786 param_table = ( 11787 param.get("calculation", {}) 11788 .get("calculations", {}) 11789 .get(operation_name, {}) 11790 .get("table", None) 11791 ) 11792 11793 # Init fields_to_rename 11794 if fields_to_rename is None: 11795 fields_to_rename = param_fields_to_rename 11796 11797 # Init table 11798 if table is None: 11799 table = param_table 11800 11801 renamed_fields = self.rename_info_fields( 11802 fields_to_rename=fields_to_rename, table=table 11803 ) 11804 11805 log.debug(f"renamed_fields:{renamed_fields}")
37class Variants: 38 39 def __init__( 40 self, 41 conn=None, 42 input: str = None, 43 output: str = None, 44 config: dict = {}, 45 param: dict = {}, 46 load: bool = False, 47 ) -> None: 48 """ 49 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 50 header 51 52 :param conn: the connection to the database 53 :param input: the input file 54 :param output: the output file 55 :param config: a dictionary containing the configuration of the model 56 :param param: a dictionary containing the parameters of the model 57 """ 58 59 # Init variables 60 self.init_variables() 61 62 # Input 63 self.set_input(input) 64 65 # Config 66 self.set_config(config) 67 68 # Param 69 self.set_param(param) 70 71 # Output 72 self.set_output(output) 73 74 # connexion 75 self.set_connexion(conn) 76 77 # Header 78 self.set_header() 79 80 # Samples 81 self.set_samples() 82 83 # Load data 84 if load: 85 self.load_data() 86 87 def set_samples(self, samples: list = None) -> list: 88 """ 89 The function `set_samples` sets the samples attribute of an object to a provided list or 90 retrieves it from a parameter dictionary. 91 92 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 93 input and sets the `samples` attribute of the class to the provided list. If no samples are 94 provided, it tries to get the samples from the class's parameters using the `get_param` method 95 :type samples: list 96 :return: The `samples` list is being returned. 97 """ 98 99 if not samples: 100 samples = self.get_param().get("samples", {}).get("list", None) 101 102 self.samples = samples 103 104 return samples 105 106 def get_samples(self) -> list: 107 """ 108 This function returns a list of samples. 109 :return: The `get_samples` method is returning the `samples` attribute of the object. 
110 """ 111 112 return self.samples 113 114 def get_samples_check(self) -> bool: 115 """ 116 This function returns the value of the "check" key within the "samples" dictionary retrieved 117 from the parameters. 118 :return: The method `get_samples_check` is returning the value of the key "check" inside the 119 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 120 method. If the key "check" is not found, it will return `False`. 121 """ 122 123 return self.get_param().get("samples", {}).get("check", True) 124 125 def set_input(self, input: str = None) -> None: 126 """ 127 The function `set_input` takes a file name as input, extracts the name and extension, and sets 128 attributes in the class accordingly. 129 130 :param input: The `set_input` method in the provided code snippet is used to set attributes 131 related to the input file. Here's a breakdown of the parameters and their usage in the method: 132 :type input: str 133 """ 134 135 if input and not isinstance(input, str): 136 try: 137 self.input = input.name 138 except: 139 log.error(f"Input file '{input} in bad format") 140 raise ValueError(f"Input file '{input} in bad format") 141 else: 142 self.input = input 143 144 # Input format 145 if input: 146 input_name, input_extension = os.path.splitext(self.input) 147 self.input_name = input_name 148 self.input_extension = input_extension 149 self.input_format = self.input_extension.replace(".", "") 150 151 def set_config(self, config: dict) -> None: 152 """ 153 The set_config function takes a config object and assigns it as the configuration object for the 154 class. 155 156 :param config: The `config` parameter in the `set_config` function is a dictionary object that 157 contains configuration settings for the class. 
When you call the `set_config` function with a 158 dictionary object as the argument, it will set that dictionary as the configuration object for 159 the class 160 :type config: dict 161 """ 162 163 self.config = config 164 165 def set_param(self, param: dict) -> None: 166 """ 167 This function sets a parameter object for the class based on the input dictionary. 168 169 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 170 as the `param` attribute of the class instance 171 :type param: dict 172 """ 173 174 self.param = param 175 176 def init_variables(self) -> None: 177 """ 178 This function initializes the variables that will be used in the rest of the class 179 """ 180 181 self.prefix = "howard" 182 self.table_variants = "variants" 183 self.dataframe = None 184 185 self.comparison_map = { 186 "gt": ">", 187 "gte": ">=", 188 "lt": "<", 189 "lte": "<=", 190 "equals": "=", 191 "contains": "SIMILAR TO", 192 } 193 194 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 195 196 self.code_type_map_to_sql = { 197 "Integer": "INTEGER", 198 "String": "VARCHAR", 199 "Float": "FLOAT", 200 "Flag": "VARCHAR", 201 } 202 203 self.index_additionnal_fields = [] 204 205 def get_indexing(self) -> bool: 206 """ 207 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 208 returns False. 209 :return: The value of the indexing parameter. 210 """ 211 212 return self.get_param().get("indexing", False) 213 214 def get_connexion_config(self) -> dict: 215 """ 216 The function `get_connexion_config` returns a dictionary containing the configuration for a 217 connection, including the number of threads and memory limit. 218 :return: a dictionary containing the configuration for the Connexion library. 
def get_duckdb_settings(self) -> dict:
    """
    Load DuckDB settings from the configuration.

    The "duckdb_settings" config entry may be either a path to a
    JSON/YAML file or a JSON string; both are parsed into a dict.

    :return: Dictionary of DuckDB settings (empty if none configured).
    """

    settings = self.get_config().get("duckdb_settings", None)
    if not settings:
        return {}

    settings = full_path(settings)
    if os.path.exists(settings):
        # Settings provided as a file
        with open(settings) as settings_file:
            return yaml.safe_load(settings_file)
    # Settings provided as a JSON string
    return json.loads(settings)

def set_connexion_db(self) -> str:
    """
    Determine and store the database connection string, based on the
    input format and the configured connection type.

    :return: The connection string (e.g. ":memory:" or a file path).
    """

    in_memory = ":memory:"
    connexion_type = self.get_connexion_type()

    if self.get_input_format() in ["db", "duckdb"]:
        # The input is itself a database file: connect to it directly
        connexion_db = self.get_input()
    elif connexion_type in ["memory", in_memory, None]:
        connexion_db = in_memory
    elif connexion_type in ["tmpfile"]:
        # Database file inside a dedicated temporary folder
        tmp_name = tempfile.mkdtemp(
            prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
        )
        connexion_db = f"{tmp_name}/tmp.db"
    elif connexion_type != "":
        # Any other non-empty value is taken as an explicit path
        connexion_db = connexion_type
    else:
        connexion_db = in_memory

    self.connexion_db = connexion_db
    return connexion_db

def set_connexion(self, conn) -> None:
    """
    Create (if needed) and store the database connection.

    When no connection is provided, a new one is opened according to the
    configured format ("duckdb" by default, or "sqlite") on the database
    determined by ``set_connexion_db``, applying any configured DuckDB
    settings as PRAGMA statements.

    :param conn: An existing database connection, or None to create one.
    """

    # Where to connect and with which options
    connexion_db = self.set_connexion_db()
    connexion_config = self.get_connexion_config()

    # Connexion format
    connexion_format = self.get_config().get("connexion_format", "duckdb")
    self.connexion_format = connexion_format

    if not conn:
        if connexion_format in ["duckdb"]:
            conn = duckdb.connect(connexion_db, config=connexion_config)
            # Apply extra DuckDB settings through PRAGMA statements
            duckdb_settings = self.get_duckdb_settings()
            if duckdb_settings:
                for setting, setting_value in duckdb_settings.items():
                    if isinstance(setting_value, str):
                        setting_value = f"'{setting_value}'"
                    conn.execute(f"PRAGMA {setting}={setting_value};")
        elif connexion_format in ["sqlite"]:
            conn = sqlite3.connect(connexion_db)

    # Store connexion
    self.conn = conn

    # Log
    log.debug(f"connexion_format: {connexion_format}")
    log.debug(f"connexion_db: {connexion_db}")
    log.debug(f"connexion config: {connexion_config}")
    log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
def set_output(self, output: str = None) -> None:
    """
    Set the output file and derive its name, extension and format.

    :param output: Path to the output file, or a file-like object
        exposing a ``name`` attribute. If None, the derived output
        attributes are reset to None.
    :type output: str
    """

    if output and not isinstance(output, str):
        self.output = output.name
    else:
        self.output = output

    # Output format, derived from the file extension
    if self.output:
        output_name, output_extension = os.path.splitext(self.output)
        self.output_name = output_name
        self.output_extension = output_extension
        self.output_format = self.output_extension.replace(".", "")
    else:
        self.output_name = None
        self.output_extension = None
        self.output_format = None

def set_header(self) -> None:
    """
    Read or build the VCF header of the input file and store it both as
    a list of raw lines (``self.header_list``) and as a ``vcf.Reader``
    object (``self.header_vcf``).

    The header is looked up in order: in the configured "header_file",
    inside the input file itself (vcf/hdr, compressed or not), in an
    external "<input>.hdr" file, then reconstructed from the file's
    columns; a minimal default VCF header is used as last resort.

    :raises ValueError: If the input file format is not supported.
    """

    input_file = self.get_input()
    default_header_list = [
        "##fileformat=VCFv4.2",
        "#CHROM POS ID REF ALT QUAL FILTER INFO",
    ]

    # Full path
    input_file = full_path(input_file)

    if input_file:

        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        config = self.get_config()
        header_list = default_header_list
        if input_format in [
            "vcf",
            "hdr",
            "tsv",
            "csv",
            "psv",
            "parquet",
            "db",
            "duckdb",
        ]:
            # Header provided in param
            if config.get("header_file", None):
                with open(config.get("header_file"), "rt") as f:
                    header_list = self.read_vcf_header(f)
            # Within a vcf file format (header within input file itself)
            elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                # Within a compressed vcf file format (.vcf.gz)
                if input_compressed:
                    with bgzf.open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
                # Within an uncompressed vcf file format (.vcf)
                else:
                    with open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
            # Header provided in default external file .hdr
            elif os.path.exists((input_file + ".hdr")):
                with open(input_file + ".hdr", "rt") as f:
                    header_list = self.read_vcf_header(f)
            else:
                try:  # Try to get header info fields and file columns

                    with tempfile.TemporaryDirectory() as tmpdir:

                        # Create database
                        db_for_header = Database(database=input_file)

                        # Get header columns for infos fields
                        db_header_from_columns = (
                            db_for_header.get_header_from_columns()
                        )

                        # Get real columns in the file
                        db_header_columns = db_for_header.get_columns()

                        # Write header file
                        # NOTE(review): this file handle is not closed via a
                        # context manager; it may leak if vcf.Writer raises
                        header_file_tmp = os.path.join(tmpdir, "header")
                        f = open(header_file_tmp, "w")
                        vcf.Writer(f, db_header_from_columns)
                        f.close()

                        # Replace #CHROM line with real columns
                        header_list = db_for_header.read_header_file(
                            header_file=header_file_tmp
                        )
                        header_list[-1] = "\t".join(db_header_columns)

                except:
                    # NOTE(review): bare except deliberately falls back to the
                    # default header on any failure — consider narrowing to
                    # `except Exception` so KeyboardInterrupt still propagates

                    log.warning(
                        f"No header for file {input_file}. Set as default VCF header"
                    )
                    header_list = default_header_list

        else:  # Unknown format: refuse

            log.error(f"Input file format '{input_format}' not available")
            raise ValueError(f"Input file format '{input_format}' not available")

        if not header_list:
            header_list = default_header_list

        # Header as list
        self.header_list = header_list

        # Header as VCF object
        self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

    else:

        # No input file: no header available
        self.header_list = None
        self.header_vcf = None

def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
    """
    Execute a SQL query and return the result as a pandas DataFrame,
    dispatching on the connection format (duckdb or sqlite).

    :param query: SQL query to execute
    :type query: str
    :param limit: Maximum number of rows to fetch; when None, the full
        result set is returned
    :type limit: int
    :return: Query result as a pandas DataFrame.
    """

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Limited query: fetch only the first `limit` rows
    if limit:
        # NOTE(review): this mutates a global pandas display option as a
        # side effect — presumably intended for previewing; confirm
        pd.set_option("display.max_rows", limit)
        if connexion_format in ["duckdb"]:
            df = (
                self.conn.execute(query)
                .fetch_record_batch(limit)
                .read_next_batch()
                .to_pandas()
            )
        elif connexion_format in ["sqlite"]:
            # First chunk of size `limit`
            df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

    # Full query
    else:
        if connexion_format in ["duckdb"]:
            df = self.conn.execute(query).df()
        elif connexion_format in ["sqlite"]:
            df = pd.read_sql_query(query, self.conn)

    return df

def get_overview(self) -> None:
    """
    Log an overview of the current object: input/output files and
    formats, config, params, sample list and the variants dataframe.
    """

    table_variants_from = self.get_table_variants(clause="from")
    sql_columns = self.get_header_columns_as_sql()
    sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
    df = self.get_query_to_df(sql_query_export)
    log.info(
        "Input: "
        + str(self.get_input())
        + " ["
        + str(str(self.get_input_format()))
        + "]"
    )
    log.info(
        "Output: "
        + str(self.get_output())
        + " ["
        + str(str(self.get_output_format()))
        + "]"
    )
    log.info("Config: ")
    for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
        "\n"
    ):
        log.info("\t" + str(d))
    log.info("Param: ")
    for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
        "\n"
    ):
        log.info("\t" + str(d))
    log.info("Sample list: " + str(self.get_header_sample_list()))
    log.info("Dataframe: ")
    for d in str(df).split("\n"):
        log.info("\t" + str(d))

    # Garbage collector: free the exported dataframe immediately
    del df
    gc.collect()

    return None
def get_stats(self) -> dict:
    """
    Compute statistics for the loaded variants: input file info, variant
    counts by chromosome, sample genotype counts, header INFO/FORMAT
    fields, quality statistics and SNV/InDel/MNV counts.

    :return: Dictionary with sections "Infos", "Variants", "Samples",
        "Header" and (when QUAL is present) "Quality".
    """

    # Log
    log.info(f"Stats Calculation...")

    # Table variants
    table_variants_from = self.get_table_variants()

    # Stats dict
    stats = {"Infos": {}}

    ### File
    input_file = self.get_input()
    stats["Infos"]["Input file"] = input_file

    # Header
    header_infos = self.get_header().infos
    header_formats = self.get_header().formats
    header_infos_list = list(header_infos)
    header_formats_list = list(header_formats)

    ### Variants

    stats["Variants"] = {}

    # Variants by chromosome
    sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
    df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
    nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
        by=["CHROM"], kind="quicksort"
    )

    # Total number of variants
    nb_of_variants = nb_of_variants_by_chrom["count"].sum()

    # Calculate percentage per chromosome
    nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
        lambda x: (x / nb_of_variants)
    )

    stats["Variants"]["Number of variants by chromosome"] = (
        nb_of_variants_by_chrom.to_dict(orient="index")
    )

    stats["Infos"]["Number of variants"] = int(nb_of_variants)

    ### Samples

    # Init
    samples = {}
    nb_of_samples = 0

    # Check samples: count genotypes per sample column
    if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
        log.debug(f"Check samples...")
        for sample in self.get_header_sample_list():
            # Extract the genotype prefix (e.g. "0/1") of each sample value,
            # keeping only rows where the value looks like a genotype and has
            # as many fields as FORMAT declares
            sql_query_samples = f"""
                SELECT '{sample}' as sample,
                REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                FROM {table_variants_from}
                WHERE (
                    regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                    AND
                    len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                )
                GROUP BY genotype
            """
            sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
            sample_genotype_count = sql_query_genotype_df["count"].sum()
            if len(sql_query_genotype_df):
                nb_of_samples += 1
                samples[f"{sample} - {sample_genotype_count} variants"] = (
                    sql_query_genotype_df.to_dict(orient="index")
                )

    stats["Samples"] = samples
    stats["Infos"]["Number of samples"] = nb_of_samples

    ### INFO and FORMAT fields
    header_types_df = {}
    header_types_list = {
        "List of INFO fields": header_infos,
        "List of FORMAT fields": header_formats,
    }
    # `i` is a running row index shared across both field tables
    i = 0
    for header_type in header_types_list:

        header_type_infos = header_types_list.get(header_type)
        header_infos_dict = {}

        for info in header_type_infos:

            i += 1
            header_infos_dict[i] = {}

            # ID
            header_infos_dict[i]["id"] = info

            # Number: map special codes to VCF letters (A/G/R, "." unknown)
            genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
            if header_type_infos[info].num in genotype_map.keys():
                header_infos_dict[i]["Number"] = genotype_map.get(
                    header_type_infos[info].num
                )
            else:
                header_infos_dict[i]["Number"] = header_type_infos[info].num

            # Type
            if header_type_infos[info].type:
                header_infos_dict[i]["Type"] = header_type_infos[info].type
            else:
                header_infos_dict[i]["Type"] = "."

            # Description
            if header_type_infos[info].desc != None:
                header_infos_dict[i]["Description"] = header_type_infos[info].desc
            else:
                header_infos_dict[i]["Description"] = ""

        if len(header_infos_dict):
            header_types_df[header_type] = pd.DataFrame.from_dict(
                header_infos_dict, orient="index"
            ).to_dict(orient="index")

    # Stats
    stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
    stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
    stats["Header"] = header_types_df

    ### QUAL
    if "QUAL" in self.get_header_columns():
        sql_query_qual = f"""
            SELECT
                avg(CAST(QUAL AS INTEGER)) AS Average,
                min(CAST(QUAL AS INTEGER)) AS Minimum,
                max(CAST(QUAL AS INTEGER)) AS Maximum,
                stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                median(CAST(QUAL AS INTEGER)) AS Median,
                variance(CAST(QUAL AS INTEGER)) AS Variance
            FROM {table_variants_from}
            WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
        """

        qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
        stats["Quality"] = {"Stats": qual}

    ### SNV and InDel

    # NOTE(review): in the InDel branch below, AND binds tighter than OR, so
    # the condition reads `len(REF) > 1 OR (len(ALT) > 1 AND len(REF) !=
    # len(ALT))` — confirm this is the intended InDel definition
    sql_query_snv = f"""

        SELECT Type, count FROM (

            SELECT
                'Total' AS Type,
                count(*) AS count
            FROM {table_variants_from}

            UNION

            SELECT
                'MNV' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

            UNION

            SELECT
                'InDel' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

            UNION

            SELECT
                'SNV' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1

        )

        ORDER BY count DESC

    """
    snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

    # Single-nucleotide substitution spectrum (REF>ALT)
    sql_query_snv_substitution = f"""
        SELECT
            concat(REF, '>', ALT) AS 'Substitution',
            count(*) AS count
        FROM {table_variants_from}
        WHERE len(REF) = 1 AND len(ALT) = 1
        GROUP BY REF, ALT
        ORDER BY count(*) DESC
    """
    snv_substitution = (
        self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
    )
    stats["Variants"]["Counts"] = snv_indel
    stats["Variants"]["Substitutions"] = snv_substitution

    return stats

def stats_to_file(self, file: str = None) -> str:
    """
    Serialize the statistics returned by ``get_stats`` as JSON and write
    them to the given file.

    :param file: Path of the JSON file to write
    :type file: str
    :return: The path of the file written.
    """

    # Get stats
    stats = self.get_stats()

    # Serializing json
    json_object = json.dumps(stats, indent=4)

    # Writing to the stats file
    with open(file, "w") as outfile:
        outfile.write(json_object)

    return file

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    """
    Generate a markdown report of the statistics and print it.

    :param output_file: Path of the markdown output file; when None, a
        temporary "stats.md" is used
    :type output_file: str
    :param json_file: Path of the JSON stats file; when None, a
        temporary "stats.json" is used
    :type json_file: str
    :return: None.
    """

    # Full path
    output_file = full_path(output_file)
    json_file = full_path(json_file)

    with tempfile.TemporaryDirectory() as tmpdir:

        # Files
        if not output_file:
            output_file = os.path.join(tmpdir, "stats.md")
        if not json_file:
            json_file = os.path.join(tmpdir, "stats.json")

        # Create folders
        if not os.path.exists(os.path.dirname(output_file)):
            Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
        if not os.path.exists(os.path.dirname(json_file)):
            Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

        # Create stats JSON file
        stats_file = self.stats_to_file(file=json_file)

        # Load stats file (JSON is a YAML subset, so safe_load parses it)
        with open(stats_file) as f:
            stats = yaml.safe_load(f)

        # Output buffers
        output_title = []
        output_index = []
        output = []

        # Title
        output_title.append("# HOWARD Stats")

        # Index
        output_index.append("## Index")

        # Process sections
        for section in stats:
            infos = stats.get(section)
            section_link = "#" + section.lower().replace(" ", "-")
            output.append(f"## {section}")
            output_index.append(f"- [{section}]({section_link})")

            if len(infos):
                for info in infos:
                    # Render as a markdown table when the value is
                    # table-like (dict of rows, possibly JSON-encoded),
                    # otherwise as a plain bullet line
                    try:
                        df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                        is_df = True
                    except:
                        try:
                            df = pd.DataFrame.from_dict(
                                json.loads((infos.get(info))), orient="index"
                            )
                            is_df = True
                        except:
                            is_df = False
                    if is_df:
                        output.append(f"### {info}")
                        info_link = "#" + info.lower().replace(" ", "-")
                        output_index.append(f"  - [{info}]({info_link})")
                        output.append(f"{df.to_markdown(index=False)}")
                    else:
                        output.append(f"- {info}: {infos.get(info)}")
            else:
                output.append(f"NA")

        # Write stats in markdown file
        with open(output_file, "w") as fp:
            for item in output_title:
                fp.write("%s\n" % item)
            for item in output_index:
                fp.write("%s\n" % item)
            for item in output:
                fp.write("%s\n" % item)

        # Output stats in markdown
        print("")
        print("\n\n".join(output_title))
        print("")
        print("\n\n".join(output))
        print("")

    return None

def get_input(self) -> str:
    """
    Return the input file path.

    :return: The input attribute.
    """
    return self.input

def get_input_format(self, input_file: str = None) -> str:
    """
    Return the format of the given input file (or of the configured
    input when none is given), based on its file extension.

    :param input_file: Path of the file to inspect; defaults to the
        configured input
    :type input_file: str
    :return: The detected file format.
    """

    if not input_file:
        input_file = self.get_input()
    input_format = get_file_format(input_file)
    return input_format

def get_input_compressed(self, input_file: str = None) -> str:
    """
    Return whether the given input file (or the configured input when
    none is given) is compressed.

    :param input_file: Path of the file to inspect; defaults to the
        configured input
    :type input_file: str
    :return: The compression status of the input file.
    """

    if not input_file:
        input_file = self.get_input()
    input_compressed = get_file_compressed(input_file)
    return input_compressed
def get_output(self) -> str:
    """
    Return the output file path.

    :return: The output attribute.
    """

    return self.output

def get_output_format(self, output_file: str = None) -> str:
    """
    Return the format of the given output file (or of the configured
    output when none is given), based on its file extension.

    :param output_file: Path of the file to inspect; defaults to the
        configured output
    :type output_file: str
    :return: The detected file format.
    """

    target = output_file if output_file else self.get_output()
    return get_file_format(target)

def get_config(self) -> dict:
    """
    Return the configuration dictionary.

    :return: The config attribute.
    """

    return self.config

def get_param(self) -> dict:
    """
    Return the parameters dictionary.

    :return: The param attribute.
    """

    return self.param

def get_connexion_db(self) -> str:
    """
    Return the database connection string.

    :return: The connexion_db attribute.
    """

    return self.connexion_db

def get_prefix(self) -> str:
    """
    Return the prefix used for generated artifacts.

    :return: The prefix attribute.
    """

    return self.prefix

def get_table_variants(self, clause: str = "select") -> str:
    """
    Return the variants table expression for use in a SQL clause.

    :param clause: The clause the table will be used in ("select",
        "where", "update" or "from"), defaults to "select"
    :return: The table name, aliased as "variants" for a FROM clause.
    """

    if clause != "from":
        # "select", "where", "update" and any other clause use the bare name
        return self.table_variants

    # FROM clause: a read-only parquet input is queried in place
    access = self.get_config().get("access", None)
    if self.get_input_format() in ["parquet"] and access in ["RO"]:
        return f"'{self.get_input()}' as variants"
    return f"{self.table_variants} as variants"

def get_tmp_dir(self) -> str:
    """
    Return the temporary directory path, resolved from config and
    params with "/tmp" as fallback.

    :return: The temporary directory path.
    """

    return get_tmp(
        config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
    )

def get_connexion_type(self) -> str:
    """
    Return the configured connection type.

    :return: Value of config["connexion_type"], defaulting to "memory".
    """

    return self.get_config().get("connexion_type", "memory")

def get_connexion(self):
    """
    Return the database connection object.

    :return: The conn attribute.
    """

    return self.conn

def close_connexion(self) -> None:
    """
    Close the database connection.

    :return: The result of closing the connection.
    """

    return self.conn.close()
def get_header(self, type: str = "vcf"):
    """
    Return the VCF header, either as a vcf.Reader object or as a list
    of raw header lines. When no header has been set, a minimal
    required VCF header is returned instead.

    :param type: "vcf" for a vcf.Reader object, "list" for the raw
        header lines, defaults to "vcf"
    :return: The header as a vcf.Reader or a list of strings.
    """

    if self.header_vcf:
        if type == "vcf":
            return self.header_vcf
        elif type == "list":
            return self.header_list
    else:
        if type == "vcf":
            # Fall back to the minimal required VCF header
            header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
            return header
        elif type == "list":
            return vcf_required

def get_header_infos_list(self) -> list:
    """
    Return the IDs of the INFO fields declared in the header.

    :return: List of INFO field IDs.
    """

    # Iterating a header's infos mapping yields its field IDs
    # (fixed: replaces a manual append loop)
    return list(self.get_header().infos)

def get_header_length(self, file: str = None) -> int:
    """
    Return the length of the header, excluding the #CHROM line.

    :param file: Optional path to a VCF header file; when given, its
        header is read and measured instead of the stored one
    :type file: str
    :return: Number of header lines without the #CHROM line, or 0 when
        no header is available.
    """

    if file:
        return len(self.read_vcf_header_file(file=file)) - 1
    elif self.get_header(type="list"):
        return len(self.get_header(type="list")) - 1
    else:
        return 0

def get_header_columns(self) -> str:
    """
    Return the #CHROM column line of the VCF header.

    :return: The last header line (the "#CHROM ..." columns line), or
        an empty string when no header is available.
    """

    if self.get_header():
        return self.get_header(type="list")[-1]
    else:
        return ""

def get_header_columns_as_list(self) -> list:
    """
    Return the VCF header columns as a list.

    :return: The #CHROM line split on tabs, or an empty list when no
        header is available.
    """

    if self.get_header():
        return self.get_header_columns().strip().split("\t")
    else:
        return []

def get_header_columns_as_sql(self) -> str:
    """
    Return the VCF header columns as a SQL column list.

    :return: Comma-joined, double-quoted column names.
    """

    sql_column_list = []
    for col in self.get_header_columns_as_list():
        sql_column_list.append(f'"{col}"')
    return ",".join(sql_column_list)

def get_header_sample_list(
    self, check: bool = False, samples: list = None, samples_force: bool = False
) -> list:
    """
    Return the list of samples from the VCF header, with optional
    filtering and genotype-column validation.

    :param check: When True, keep only samples whose column is a valid
        genotype column, defaults to False
    :type check: bool (optional)
    :param samples: Optional subset of sample names; names not present
        in the header are dropped with a warning
    :type samples: list
    :param samples_force: When True, return the sample list without
        checking genotype columns, defaults to False
    :type samples_force: bool (optional)
    :return: The list of samples.
    """

    # Init
    samples_list = []

    if samples is None:
        samples_list = self.header_vcf.samples
    else:
        # Keep only requested samples that exist in the header
        samples_checked = []
        for sample in samples:
            if sample in self.header_vcf.samples:
                samples_checked.append(sample)
            else:
                log.warning(f"Sample '{sample}' not defined in header")
        samples_list = samples_checked

    # Force sample list without checking if is_genotype_column
    if samples_force:
        log.warning(f"Samples {samples_list} not checked if genotypes")
        return samples_list

    if check:
        # Keep only samples whose column actually holds genotypes
        samples_checked = []
        for sample in samples_list:
            if self.is_genotype_column(column=sample):
                samples_checked.append(sample)
            else:
                log.warning(
                    f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                )
        samples_list = samples_checked

    # Return samples list
    return samples_list

def is_genotype_column(self, column: str = None) -> bool:
    """
    Check whether a column of the input database is a genotype column.

    :param column: Name of the column to check
    :type column: str
    :return: True if the column is a genotype column, False when no
        column is given.
    """

    if column is not None:
        return Database(database=self.get_input()).is_genotype_column(column=column)
    else:
        return False
If a column name is provided, it calls the `is_genotype_column` 1230 method of 1231 :type column: str 1232 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1233 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1234 column name and returns the result. If the `column` parameter is None, it returns False. 1235 """ 1236 1237 if column is not None: 1238 return Database(database=self.get_input()).is_genotype_column(column=column) 1239 else: 1240 return False 1241 1242 def get_verbose(self) -> bool: 1243 """ 1244 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1245 exist 1246 1247 :return: The value of the key "verbose" in the config dictionary. 1248 """ 1249 return self.get_config().get("verbose", False) 1250 1251 def get_connexion_format(self) -> str: 1252 """ 1253 It returns the connexion format of the object. 1254 :return: The connexion_format is being returned. 1255 """ 1256 connexion_format = self.connexion_format 1257 if connexion_format not in ["duckdb", "sqlite"]: 1258 log.error(f"Unknown connexion format {connexion_format}") 1259 raise ValueError(f"Unknown connexion format {connexion_format}") 1260 else: 1261 return connexion_format 1262 1263 def insert_file_to_table( 1264 self, 1265 file, 1266 columns: str, 1267 header_len: int = 0, 1268 sep: str = "\t", 1269 chunksize: int = 1000000, 1270 ) -> None: 1271 """ 1272 The function reads a file in chunks and inserts each chunk into a table based on the specified 1273 database format. 1274 1275 :param file: The `file` parameter is the file that you want to load into a table. It should be 1276 the path to the file on your system 1277 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1278 should contain the names of the columns in the table where the data will be inserted. 
The column 1279 names should be separated by commas within the string. For example, if you have columns named 1280 "id", "name 1281 :type columns: str 1282 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1283 the number of lines to skip at the beginning of the file before reading the actual data. This 1284 parameter allows you to skip any header information present in the file before processing the 1285 data, defaults to 0 1286 :type header_len: int (optional) 1287 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1288 separator character that is used in the file being read. In this case, the default separator is 1289 set to `\t`, which represents a tab character. You can change this parameter to a different 1290 separator character if, defaults to \t 1291 :type sep: str (optional) 1292 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1293 when processing the file in chunks. In the provided code snippet, the default value for 1294 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1295 to 1000000 1296 :type chunksize: int (optional) 1297 """ 1298 1299 # Config 1300 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1301 connexion_format = self.get_connexion_format() 1302 1303 log.debug("chunksize: " + str(chunksize)) 1304 1305 if chunksize: 1306 for chunk in pd.read_csv( 1307 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1308 ): 1309 if connexion_format in ["duckdb"]: 1310 sql_insert_into = ( 1311 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1312 ) 1313 self.conn.execute(sql_insert_into) 1314 elif connexion_format in ["sqlite"]: 1315 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1316 1317 def load_data( 1318 self, 1319 input_file: str = None, 1320 drop_variants_table: bool = False, 1321 sample_size: int = 20480, 1322 ) -> None: 1323 """ 1324 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1325 table before loading the data and specify a sample size. 1326 1327 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1328 table 1329 :type input_file: str 1330 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1331 determines whether the variants table should be dropped before loading the data. If set to 1332 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1333 not be dropped, defaults to False 1334 :type drop_variants_table: bool (optional) 1335 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1336 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to
        20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size
        # A falsy sample_size means "no sampling limit" (-1) downstream
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    # Read-only access creates a VIEW over the source instead
                    # of materializing a TABLE
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except — any failure (not only an
                # unsupported format) is reported as "format not available";
                # consider narrowing the exception type
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            # Fixed VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this is an alias, not a copy — both names refer to
            # the same dict, so mutations below affect "structure" too
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): for compressed input the bgzf handle rebinds
                # input_file; the bgzf handle is never explicitly closed and
                # the plain handle (still held by the with-block) goes unused
                # — TODO confirm/close
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

    def get_explode_infos(self) -> bool:
        """
        Return the "explode_infos" flag from the parameters.

        :return: the value of param key explode.explode_infos, or False if
        absent.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Resolve the list of INFO fields to explode.

        Fields may be given as a comma-separated string or a list; each entry
        is treated as a regex pattern matched against the header INFO fields,
        and "*" is a keyword meaning "all header fields". When no fields are
        provided, the param key explode.explode_infos_fields is used, and
        failing that "*".

        :param explode_infos_fields: fields to explode, as a comma-separated
        string or a list; "*" expands to all header INFO fields
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: if True, fields not present in the
        header are excluded from the result; if False, an unmatched field is
        kept as-is, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: ordered list of resolved field names (without duplicates).
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search
                # An exact header match wins over regex expansion; otherwise
                # drop pattern matches that were also given explicitly
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        Return the prefix used for exploded INFO columns.

        :param explode_infos_prefix: explicit prefix; when falsy, the param key
        explode.explode_infos_prefix is used (defaulting to "")
        :type explode_infos_prefix: str
        :return: the resolved prefix.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column.
If a default value is provided, it will be assigned to 1662 the column for any existing rows that do not have a value for that column 1663 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1664 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1665 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1666 to False 1667 :type drop: bool (optional) 1668 :return: a boolean value indicating whether the column was successfully added to the table. 1669 """ 1670 1671 # added 1672 added = False 1673 dropped = False 1674 1675 # Check if the column already exists in the table 1676 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1677 columns = self.get_query_to_df(query).columns.tolist() 1678 if column_name.upper() in [c.upper() for c in columns]: 1679 log.debug( 1680 f"The {column_name} column already exists in the {table_name} table" 1681 ) 1682 if drop: 1683 self.drop_column(table_name=table_name, column_name=column_name) 1684 dropped = True 1685 else: 1686 return None 1687 else: 1688 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1689 1690 # Add column in table 1691 add_column_query = ( 1692 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1693 ) 1694 if default_value is not None: 1695 add_column_query += f" DEFAULT {default_value}" 1696 self.execute_query(add_column_query) 1697 added = not dropped 1698 log.debug( 1699 f"The {column_name} column was successfully added to the {table_name} table" 1700 ) 1701 1702 if added: 1703 added_column = { 1704 "table_name": table_name, 1705 "column_name": column_name, 1706 "column_type": column_type, 1707 "default_value": default_value, 1708 } 1709 else: 1710 added_column = None 1711 1712 return added_column 1713 1714 def drop_column( 1715 self, column: dict = None, table_name: str = None, column_name: str = None 1716 ) -> bool: 1717 """ 1718 The 
`drop_column` function drops a specified column from a given table in a database and returns 1719 True if the column was successfully dropped, and False if the column does not exist in the 1720 table. 1721 1722 :param column: The `column` parameter is a dictionary that contains information about the column 1723 you want to drop. It has two keys: 1724 :type column: dict 1725 :param table_name: The `table_name` parameter is the name of the table from which you want to 1726 drop a column 1727 :type table_name: str 1728 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1729 from the table 1730 :type column_name: str 1731 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1732 and False if the column does not exist in the table. 1733 """ 1734 1735 # Find column infos 1736 if column: 1737 if isinstance(column, dict): 1738 table_name = column.get("table_name", None) 1739 column_name = column.get("column_name", None) 1740 elif isinstance(column, str): 1741 table_name = self.get_table_variants() 1742 column_name = column 1743 else: 1744 table_name = None 1745 column_name = None 1746 1747 if not table_name and not column_name: 1748 return False 1749 1750 # Removed 1751 removed = False 1752 1753 # Check if the column already exists in the table 1754 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1755 columns = self.get_query_to_df(query).columns.tolist() 1756 if column_name in columns: 1757 log.debug(f"The {column_name} column exists in the {table_name} table") 1758 else: 1759 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1760 return False 1761 1762 # Add column in table # ALTER TABLE integers DROP k 1763 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1764 self.execute_query(add_column_query) 1765 removed = True 1766 log.debug( 1767 f"The {column_name} column was successfully dropped to the {table_name} table" 1768 ) 1769 
1770 return removed 1771 1772 def explode_infos( 1773 self, 1774 prefix: str = None, 1775 create_index: bool = False, 1776 fields: list = None, 1777 force: bool = False, 1778 proccess_all_fields_together: bool = False, 1779 table: str = None, 1780 ) -> list: 1781 """ 1782 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1783 individual columns, returning a list of added columns. 1784 1785 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1786 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1787 `self.get_explode_infos_prefix()` as the prefix 1788 :type prefix: str 1789 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1790 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1791 `False`, indexes will not be created. The default value is `False`, defaults to False 1792 :type create_index: bool (optional) 1793 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1794 that you want to explode into individual columns. If this parameter is not provided, all INFO 1795 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1796 a list to the ` 1797 :type fields: list 1798 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1799 determines whether to drop and recreate a column if it already exists in the table. If `force` 1800 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1801 defaults to False 1802 :type force: bool (optional) 1803 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1804 flag that determines whether to process all the INFO fields together or individually. If set to 1805 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will
        be processed individually. The default value is, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If you provide
        a value for the `table` parameter, the function will use that table name. If the `table`
        parameter is
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # Read-only access: nothing is exploded, return the empty list
        if access not in ["RO"]:

            # prefix
            # Fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except — failures in get_extra_infos are
            # silently treated as "no extra infos"
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Resolve SQL type from the header declaration; unknown
                    # fields default to String with num 0
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    # Multi-valued fields are stored as VARCHAR
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array
                        # Each entry is a SET clause extracting the field value
                        # from the raw INFO string
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes
                # NOTE(review): bare except — any query failure falls back to a
                # single pass over the whole table
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion: a composite
        (#CHROM, POS, REF, ALT) index, one index per coordinate column, and one
        per field registered in `index_additionnal_fields`.

        No-op when indexing is disabled or access is read-only ("RO").
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all existing indexes on the variants table (listed through
        duckdb_indexes or sqlite_master depending on the connexion format).

        No-op when access is read-only ("RO").
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        Read the header of a VCF file and return the header lines, up to and
        including the "#CHROM" columns line.

        :param f: an open file object positioned at the start of the header
        :return: list of header lines (each including its trailing newline).
        """

        header_list = []
        for line in f:
            header_list.append(line)
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        Read the header of a VCF file from disk, handling both bgzip-compressed
        and plain-text files.

        :param file: path to the VCF file whose header should be read
        :type file: str
        :return: list of header lines (see `read_vcf_header`).
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
2082 """ 2083 if query: 2084 return self.conn.execute(query) # .fetchall() 2085 else: 2086 return None 2087 2088 def export_output( 2089 self, 2090 output_file: str | None = None, 2091 output_header: str | None = None, 2092 export_header: bool = True, 2093 query: str | None = None, 2094 parquet_partitions: list | None = None, 2095 chunk_size: int | None = None, 2096 threads: int | None = None, 2097 sort: bool = False, 2098 index: bool = False, 2099 order_by: str | None = None, 2100 fields_to_rename: dict | None = None 2101 ) -> bool: 2102 """ 2103 The `export_output` function exports data from a VCF file to various formats, including VCF, 2104 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2105 partitioning. 2106 2107 :param output_file: The `output_file` parameter is a string that specifies the name of the 2108 output file where the exported data will be saved 2109 :type output_file: str | None 2110 :param output_header: The `output_header` parameter is a string that specifies the name of the 2111 file where the header of the VCF file will be exported. If this parameter is not provided, the 2112 header will be exported to a file with the same name as the `output_file` parameter, but with 2113 the extension " 2114 :type output_header: str | None 2115 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2116 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2117 True, the header will be exported to a file. If `export_header` is False, the header will not 2118 be, defaults to True 2119 :type export_header: bool (optional) 2120 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2121 that can be used to filter and select specific data from the VCF file before exporting it. If 2122 provided, only the data that matches the query will be exported. 
This allows you to customize 2123 the exported data based on 2124 :type query: str | None 2125 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2126 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2127 organize data in a hierarchical directory structure based on the values of one or more columns. 2128 This can improve query performance when working with large datasets 2129 :type parquet_partitions: list | None 2130 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2131 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2132 multiple files. It helps in optimizing the export process by breaking down the data into 2133 manageable chunks for processing and storage 2134 :type chunk_size: int | None 2135 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2136 threads to be used during the export process. It determines the level of parallelism and can 2137 improve the performance of the export operation. If this parameter is not provided, the function 2138 will use the default number of threads 2139 :type threads: int | None 2140 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2141 determines whether the output file should be sorted based on genomic coordinates of the 2142 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2143 `False`,, defaults to False 2144 :type sort: bool (optional) 2145 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2146 determines whether an index should be created on the output file. If `index` is set to `True`, 2147 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2148 :type index: bool (optional) 2149 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2150 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2151 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2152 output file should be 2153 :type order_by: str | None 2154 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2155 mapping of field names to be renamed during the export process. This parameter allows you to 2156 customize the output field names before exporting the data. Each key-value pair in the 2157 dictionary represents the original field name as the key and the new field name 2158 :type fields_to_rename: dict | None 2159 :return: The `export_output` function returns a boolean value. It checks if the output file 2160 exists and returns True if it does, or None if it doesn't. 
2161 """ 2162 2163 # Log 2164 log.info("Exporting...") 2165 2166 # Full path 2167 output_file = full_path(output_file) 2168 output_header = full_path(output_header) 2169 2170 # Config 2171 config = self.get_config() 2172 2173 # Param 2174 param = self.get_param() 2175 2176 # Tmp files to remove 2177 tmp_to_remove = [] 2178 2179 # If no output, get it 2180 if not output_file: 2181 output_file = self.get_output() 2182 2183 # If not threads 2184 if not threads: 2185 threads = self.get_threads() 2186 2187 # Rename fields 2188 if not fields_to_rename: 2189 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2190 self.rename_info_fields(fields_to_rename=fields_to_rename) 2191 2192 # Auto header name with extension 2193 if export_header or output_header: 2194 if not output_header: 2195 output_header = f"{output_file}.hdr" 2196 # Export header 2197 self.export_header(output_file=output_file) 2198 2199 # Switch off export header if VCF output 2200 output_file_type = get_file_format(output_file) 2201 if output_file_type in ["vcf"]: 2202 export_header = False 2203 tmp_to_remove.append(output_header) 2204 2205 # Chunk size 2206 if not chunk_size: 2207 chunk_size = config.get("chunk_size", None) 2208 2209 # Parquet partition 2210 if not parquet_partitions: 2211 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2212 if parquet_partitions and isinstance(parquet_partitions, str): 2213 parquet_partitions = parquet_partitions.split(",") 2214 2215 # Order by 2216 if not order_by: 2217 order_by = param.get("export", {}).get("order_by", "") 2218 2219 # Header in output 2220 header_in_output = param.get("export", {}).get("include_header", False) 2221 2222 # Database 2223 database_source = self.get_connexion() 2224 2225 # Connexion format 2226 connexion_format = self.get_connexion_format() 2227 2228 # Explode infos 2229 if self.get_explode_infos(): 2230 self.explode_infos( 2231 prefix=self.get_explode_infos_prefix(), 2232 
fields=self.get_explode_infos_fields(), 2233 force=False, 2234 ) 2235 2236 # if connexion_format in ["sqlite"] or query: 2237 if connexion_format in ["sqlite"]: 2238 2239 # Export in Parquet 2240 random_tmp = "".join( 2241 random.choice(string.ascii_lowercase) for i in range(10) 2242 ) 2243 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2244 tmp_to_remove.append(database_source) 2245 2246 # Table Variants 2247 table_variants = self.get_table_variants() 2248 2249 # Create export query 2250 sql_query_export_subquery = f""" 2251 SELECT * FROM {table_variants} 2252 """ 2253 2254 # Write source file 2255 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2256 2257 # Create database 2258 database = Database( 2259 database=database_source, 2260 table="variants", 2261 header_file=output_header, 2262 conn_config=self.get_connexion_config(), 2263 ) 2264 2265 # Existing colomns header 2266 existing_columns_header = database.get_header_columns_from_database(query=query) 2267 2268 # Sample list 2269 if output_file_type in ["vcf"]: 2270 get_samples = self.get_samples() 2271 get_samples_check = self.get_samples_check() 2272 samples_force = get_samples is not None 2273 sample_list = self.get_header_sample_list( 2274 check=get_samples_check, 2275 samples=get_samples, 2276 samples_force=samples_force, 2277 ) 2278 else: 2279 sample_list = None 2280 2281 # Export file 2282 database.export( 2283 output_database=output_file, 2284 output_header=output_header, 2285 existing_columns_header=existing_columns_header, 2286 parquet_partitions=parquet_partitions, 2287 chunk_size=chunk_size, 2288 threads=threads, 2289 sort=sort, 2290 index=index, 2291 header_in_output=header_in_output, 2292 order_by=order_by, 2293 query=query, 2294 export_header=export_header, 2295 sample_list=sample_list, 2296 ) 2297 2298 # Remove 2299 remove_if_exists(tmp_to_remove) 2300 2301 return (os.path.exists(output_file) or None) and ( 2302 os.path.exists(output_file) 
or None 2303 ) 2304 2305 def get_extra_infos(self, table: str = None) -> list: 2306 """ 2307 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2308 in the header. 2309 2310 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2311 name of the table from which you want to retrieve the extra columns that are not present in the 2312 header. If the `table` parameter is not provided when calling the function, it will default to 2313 using the variants 2314 :type table: str 2315 :return: A list of columns that are in the specified table but not in the header of the table. 2316 """ 2317 2318 header_columns = [] 2319 2320 if not table: 2321 table = self.get_table_variants(clause="from") 2322 header_columns = self.get_header_columns() 2323 2324 # Check all columns in the database 2325 query = f""" SELECT * FROM {table} LIMIT 1 """ 2326 log.debug(f"query {query}") 2327 table_columns = self.get_query_to_df(query).columns.tolist() 2328 extra_columns = [] 2329 2330 # Construct extra infos (not in header) 2331 for column in table_columns: 2332 if column not in header_columns: 2333 extra_columns.append(column) 2334 2335 return extra_columns 2336 2337 def get_extra_infos_sql(self, table: str = None) -> str: 2338 """ 2339 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2340 by double quotes 2341 2342 :param table: The name of the table to get the extra infos from. 
If None, the default table is 2343 used 2344 :type table: str 2345 :return: A string of the extra infos 2346 """ 2347 2348 return ", ".join( 2349 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2350 ) 2351 2352 def export_header( 2353 self, 2354 header_name: str = None, 2355 output_file: str = None, 2356 output_file_ext: str = ".hdr", 2357 clean_header: bool = True, 2358 remove_chrom_line: bool = False, 2359 ) -> str: 2360 """ 2361 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2362 specified options, and writes it to a new file. 2363 2364 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2365 this parameter is not specified, the header will be written to the output file 2366 :type header_name: str 2367 :param output_file: The `output_file` parameter in the `export_header` function is used to 2368 specify the name of the output file where the header will be written. If this parameter is not 2369 provided, the header will be written to a temporary file 2370 :type output_file: str 2371 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2372 string that represents the extension of the output header file. By default, it is set to ".hdr" 2373 if not specified by the user. This extension will be appended to the `output_file` name to 2374 create the final, defaults to .hdr 2375 :type output_file_ext: str (optional) 2376 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2377 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2378 `True`, the function will clean the header by modifying certain lines based on a specific 2379 pattern. 
If `clean_header`, defaults to True 2380 :type clean_header: bool (optional) 2381 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2382 boolean flag that determines whether the #CHROM line should be removed from the header before 2383 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2384 defaults to False 2385 :type remove_chrom_line: bool (optional) 2386 :return: The function `export_header` returns the name of the temporary header file that is 2387 created. 2388 """ 2389 2390 if not header_name and not output_file: 2391 output_file = self.get_output() 2392 2393 if self.get_header(): 2394 2395 # Get header object 2396 header_obj = self.get_header() 2397 2398 # Create database 2399 db_for_header = Database(database=self.get_input()) 2400 2401 # Get real columns in the file 2402 db_header_columns = db_for_header.get_columns() 2403 2404 with tempfile.TemporaryDirectory() as tmpdir: 2405 2406 # Write header file 2407 header_file_tmp = os.path.join(tmpdir, "header") 2408 f = open(header_file_tmp, "w") 2409 vcf.Writer(f, header_obj) 2410 f.close() 2411 2412 # Replace #CHROM line with rel columns 2413 header_list = db_for_header.read_header_file( 2414 header_file=header_file_tmp 2415 ) 2416 header_list[-1] = "\t".join(db_header_columns) 2417 2418 # Remove CHROM line 2419 if remove_chrom_line: 2420 header_list.pop() 2421 2422 # Clean header 2423 if clean_header: 2424 header_list_clean = [] 2425 for head in header_list: 2426 # Clean head for malformed header 2427 head_clean = head 2428 head_clean = re.subn( 2429 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2430 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2431 head_clean, 2432 2, 2433 )[0] 2434 # Write header 2435 header_list_clean.append(head_clean) 2436 header_list = header_list_clean 2437 2438 tmp_header_name = output_file + output_file_ext 2439 2440 f = open(tmp_header_name, "w") 2441 for line in header_list: 2442 f.write(line) 
2443 f.close() 2444 2445 return tmp_header_name 2446 2447 def export_variant_vcf( 2448 self, 2449 vcf_file, 2450 remove_info: bool = False, 2451 add_samples: bool = True, 2452 list_samples: list = [], 2453 where_clause: str = "", 2454 index: bool = False, 2455 threads: int | None = None, 2456 ) -> bool | None: 2457 """ 2458 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2459 remove INFO field, add samples, and control compression and indexing. 2460 2461 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2462 written to. It is the output file that will contain the filtered VCF data based on the specified 2463 parameters 2464 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2465 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2466 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2467 in, defaults to False 2468 :type remove_info: bool (optional) 2469 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2470 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2471 If set to False, the samples will be removed. The default value is True, defaults to True 2472 :type add_samples: bool (optional) 2473 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2474 in the output VCF file. By default, all samples will be included. If you provide a list of 2475 samples, only those samples will be included in the output file 2476 :type list_samples: list 2477 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2478 determines whether or not to create an index for the output VCF file. If `index` is set to 2479 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2480 :type index: bool (optional) 2481 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2482 number of threads to use for exporting the VCF file. It determines how many parallel threads 2483 will be used during the export process. More threads can potentially speed up the export process 2484 by utilizing multiple cores of the processor. If 2485 :type threads: int | None 2486 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2487 method with various parameters including the output file, query, threads, sort flag, and index 2488 flag. The `export_output` method is responsible for exporting the VCF data based on the 2489 specified parameters and configurations provided in the `export_variant_vcf` function. 2490 """ 2491 2492 # Config 2493 config = self.get_config() 2494 2495 # Extract VCF 2496 log.debug("Export VCF...") 2497 2498 # Table variants 2499 table_variants = self.get_table_variants() 2500 2501 # Threads 2502 if not threads: 2503 threads = self.get_threads() 2504 2505 # Info fields 2506 if remove_info: 2507 if not isinstance(remove_info, str): 2508 remove_info = "." 
2509 info_field = f"""'{remove_info}' as INFO""" 2510 else: 2511 info_field = "INFO" 2512 2513 # Samples fields 2514 if add_samples: 2515 if not list_samples: 2516 list_samples = self.get_header_sample_list() 2517 if list_samples: 2518 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2519 else: 2520 samples_fields = "" 2521 log.debug(f"samples_fields: {samples_fields}") 2522 else: 2523 samples_fields = "" 2524 2525 # Where clause 2526 if where_clause is None: 2527 where_clause = "" 2528 2529 # Variants 2530 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2531 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2532 log.debug(f"sql_query_select={sql_query_select}") 2533 2534 return self.export_output( 2535 output_file=vcf_file, 2536 output_header=None, 2537 export_header=True, 2538 query=sql_query_select, 2539 parquet_partitions=None, 2540 chunk_size=config.get("chunk_size", None), 2541 threads=threads, 2542 sort=True, 2543 index=index, 2544 order_by=None, 2545 ) 2546 2547 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2548 """ 2549 It takes a list of commands and runs them in parallel using the number of threads specified 2550 2551 :param commands: A list of commands to run 2552 :param threads: The number of threads to use, defaults to 1 (optional) 2553 """ 2554 2555 run_parallel_commands(commands, threads) 2556 2557 def get_threads(self, default: int = 1) -> int: 2558 """ 2559 This function returns the number of threads to use for a job, with a default value of 1 if not 2560 specified. 2561 2562 :param default: The `default` parameter in the `get_threads` method is used to specify the 2563 default number of threads to use if no specific value is provided. 
If no value is provided for 2564 the `threads` parameter in the configuration or input parameters, the `default` value will be 2565 used, defaults to 1 2566 :type default: int (optional) 2567 :return: the number of threads to use for the current job. 2568 """ 2569 2570 # Config 2571 config = self.get_config() 2572 2573 # Param 2574 param = self.get_param() 2575 2576 # Input threads 2577 input_thread = param.get("threads", config.get("threads", None)) 2578 2579 # Check threads 2580 if not input_thread: 2581 threads = default 2582 elif int(input_thread) <= 0: 2583 threads = os.cpu_count() 2584 else: 2585 threads = int(input_thread) 2586 return threads 2587 2588 def get_memory(self, default: str = None) -> str: 2589 """ 2590 This function retrieves the memory value from parameters or configuration with a default value 2591 if not found. 2592 2593 :param default: The `get_memory` function takes in a default value as a string parameter. This 2594 default value is used as a fallback in case the `memory` parameter is not provided in the 2595 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2596 the function 2597 :type default: str 2598 :return: The `get_memory` function returns a string value representing the memory parameter. If 2599 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2600 return the default value provided as an argument to the function. 
2601 """ 2602 2603 # Config 2604 config = self.get_config() 2605 2606 # Param 2607 param = self.get_param() 2608 2609 # Input threads 2610 input_memory = param.get("memory", config.get("memory", None)) 2611 2612 # Check threads 2613 if input_memory: 2614 memory = input_memory 2615 else: 2616 memory = default 2617 2618 return memory 2619 2620 def update_from_vcf(self, vcf_file: str) -> None: 2621 """ 2622 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2623 2624 :param vcf_file: the path to the VCF file 2625 """ 2626 2627 connexion_format = self.get_connexion_format() 2628 2629 if connexion_format in ["duckdb"]: 2630 self.update_from_vcf_duckdb(vcf_file) 2631 elif connexion_format in ["sqlite"]: 2632 self.update_from_vcf_sqlite(vcf_file) 2633 2634 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2635 """ 2636 It takes a VCF file and updates the INFO column of the variants table in the database with the 2637 INFO column of the VCF file 2638 2639 :param vcf_file: the path to the VCF file 2640 """ 2641 2642 # varaints table 2643 table_variants = self.get_table_variants() 2644 2645 # Loading VCF into temporaire table 2646 skip = self.get_header_length(file=vcf_file) 2647 vcf_df = pd.read_csv( 2648 vcf_file, 2649 sep="\t", 2650 engine="c", 2651 skiprows=skip, 2652 header=0, 2653 low_memory=False, 2654 ) 2655 sql_query_update = f""" 2656 UPDATE {table_variants} as table_variants 2657 SET INFO = concat( 2658 CASE 2659 WHEN INFO NOT IN ('', '.') 2660 THEN INFO 2661 ELSE '' 2662 END, 2663 ( 2664 SELECT 2665 concat( 2666 CASE 2667 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2668 THEN ';' 2669 ELSE '' 2670 END 2671 , 2672 CASE 2673 WHEN table_parquet.INFO NOT IN ('','.') 2674 THEN table_parquet.INFO 2675 ELSE '' 2676 END 2677 ) 2678 FROM vcf_df as table_parquet 2679 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2680 AND table_parquet.\"POS\" = 
table_variants.\"POS\" 2681 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2682 AND table_parquet.\"REF\" = table_variants.\"REF\" 2683 AND table_parquet.INFO NOT IN ('','.') 2684 ) 2685 ) 2686 ; 2687 """ 2688 self.conn.execute(sql_query_update) 2689 2690 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2691 """ 2692 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2693 table, then updates the INFO column of the variants table with the INFO column of the temporary 2694 table 2695 2696 :param vcf_file: The path to the VCF file you want to update the database with 2697 """ 2698 2699 # Create a temporary table for the VCF 2700 table_vcf = "tmp_vcf" 2701 sql_create = ( 2702 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2703 ) 2704 self.conn.execute(sql_create) 2705 2706 # Loading VCF into temporaire table 2707 vcf_df = pd.read_csv( 2708 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2709 ) 2710 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2711 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2712 2713 # Update table 'variants' with VCF data 2714 # warning: CONCAT as || operator 2715 sql_query_update = f""" 2716 UPDATE variants as table_variants 2717 SET INFO = CASE 2718 WHEN INFO NOT IN ('', '.') 2719 THEN INFO 2720 ELSE '' 2721 END || 2722 ( 2723 SELECT 2724 CASE 2725 WHEN table_variants.INFO NOT IN ('','.') 2726 AND table_vcf.INFO NOT IN ('','.') 2727 THEN ';' 2728 ELSE '' 2729 END || 2730 CASE 2731 WHEN table_vcf.INFO NOT IN ('','.') 2732 THEN table_vcf.INFO 2733 ELSE '' 2734 END 2735 FROM {table_vcf} as table_vcf 2736 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2737 AND table_vcf.\"POS\" = table_variants.\"POS\" 2738 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2739 AND table_vcf.\"REF\" = table_variants.\"REF\" 2740 ) 2741 """ 2742 self.conn.execute(sql_query_update) 2743 2744 # Drop temporary table 
2745 sql_drop = f"DROP TABLE {table_vcf}" 2746 self.conn.execute(sql_drop) 2747 2748 def drop_variants_table(self) -> None: 2749 """ 2750 > This function drops the variants table 2751 """ 2752 2753 table_variants = self.get_table_variants() 2754 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2755 self.conn.execute(sql_table_variants) 2756 2757 def set_variant_id( 2758 self, variant_id_column: str = "variant_id", force: bool = None 2759 ) -> str: 2760 """ 2761 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2762 `#CHROM`, `POS`, `REF`, and `ALT` columns 2763 2764 :param variant_id_column: The name of the column to be created in the variants table, defaults 2765 to variant_id 2766 :type variant_id_column: str (optional) 2767 :param force: If True, the variant_id column will be created even if it already exists 2768 :type force: bool 2769 :return: The name of the column that contains the variant_id 2770 """ 2771 2772 # Assembly 2773 assembly = self.get_param().get( 2774 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2775 ) 2776 2777 # INFO/Tag prefix 2778 prefix = self.get_explode_infos_prefix() 2779 2780 # Explode INFO/SVTYPE 2781 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2782 2783 # variants table 2784 table_variants = self.get_table_variants() 2785 2786 # variant_id column 2787 if not variant_id_column: 2788 variant_id_column = "variant_id" 2789 2790 # Creta variant_id column 2791 if "variant_id" not in self.get_extra_infos() or force: 2792 2793 # Create column 2794 self.add_column( 2795 table_name=table_variants, 2796 column_name=variant_id_column, 2797 column_type="UBIGINT", 2798 default_value="0", 2799 ) 2800 2801 # Update column 2802 self.conn.execute( 2803 f""" 2804 UPDATE {table_variants} 2805 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2806 """ 2807 ) 2808 2809 # Remove added columns 2810 for 
    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the name of the variant_id column, creating/populating it first.

        NOTE(review): despite its "get" name, this delegates to
        `set_variant_id`, which may add and populate the column as a side
        effect (see `set_variant_id` for the `force` semantics).

        :param variant_id_column: name of the column holding variant IDs,
            defaults to "variant_id"
        :param force: if True, the column is (re)created even if it exists
        :return: the variant_id column name
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        Scan for available annotation databases matching formats and releases.

        :param database_formats: database formats to look for, defaults to
            ["parquet"]
        :param database_releases: database releases to look for, defaults to
            ["current"]
        :return: a dict describing the databases found for the selected
            assembly, formats and releases
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly: parameters first, then configuration, then default
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for availabled databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict
2890 """ 2891 2892 # Config 2893 config = self.get_config() 2894 2895 # Param 2896 param = self.get_param() 2897 2898 # Param - Assembly 2899 assembly = param.get("assembly", config.get("assembly", None)) 2900 if not assembly: 2901 assembly = DEFAULT_ASSEMBLY 2902 log.warning(f"Default assembly '{assembly}'") 2903 2904 # annotations databases folders 2905 annotations_databases = set( 2906 config.get("folders", {}) 2907 .get("databases", {}) 2908 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2909 + config.get("folders", {}) 2910 .get("databases", {}) 2911 .get("parquet", ["~/howard/databases/parquet/current"]) 2912 + config.get("folders", {}) 2913 .get("databases", {}) 2914 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2915 ) 2916 2917 # Get param annotations 2918 if param.get("annotations", None) and isinstance( 2919 param.get("annotations", None), str 2920 ): 2921 log.debug(param.get("annotations", None)) 2922 param_annotation_list = param.get("annotations").split(",") 2923 else: 2924 param_annotation_list = [] 2925 2926 # Each tools param 2927 if param.get("annotation_parquet", None) != None: 2928 log.debug( 2929 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2930 ) 2931 if isinstance(param.get("annotation_parquet", None), list): 2932 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2933 else: 2934 param_annotation_list.append(param.get("annotation_parquet")) 2935 if param.get("annotation_snpsift", None) != None: 2936 if isinstance(param.get("annotation_snpsift", None), list): 2937 param_annotation_list.append( 2938 "snpsift:" 2939 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2940 ) 2941 else: 2942 param_annotation_list.append( 2943 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2944 ) 2945 if param.get("annotation_snpeff", None) != None: 2946 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2947 if param.get("annotation_bcftools", 
None) != None: 2948 if isinstance(param.get("annotation_bcftools", None), list): 2949 param_annotation_list.append( 2950 "bcftools:" 2951 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2952 ) 2953 else: 2954 param_annotation_list.append( 2955 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2956 ) 2957 if param.get("annotation_annovar", None) != None: 2958 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2959 if param.get("annotation_exomiser", None) != None: 2960 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2961 if param.get("annotation_splice", None) != None: 2962 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2963 2964 # Merge param annotations list 2965 param["annotations"] = ",".join(param_annotation_list) 2966 2967 # debug 2968 log.debug(f"param_annotations={param['annotations']}") 2969 2970 if param.get("annotations"): 2971 2972 # Log 2973 # log.info("Annotations - Check annotation parameters") 2974 2975 if not "annotation" in param: 2976 param["annotation"] = {} 2977 2978 # List of annotations parameters 2979 annotations_list_input = {} 2980 if isinstance(param.get("annotations", None), str): 2981 annotation_file_list = [ 2982 value for value in param.get("annotations", "").split(",") 2983 ] 2984 for annotation_file in annotation_file_list: 2985 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2986 else: 2987 annotations_list_input = param.get("annotations", {}) 2988 2989 log.info(f"Quick Annotations:") 2990 for annotation_key in list(annotations_list_input.keys()): 2991 log.info(f" {annotation_key}") 2992 2993 # List of annotations and associated fields 2994 annotations_list = {} 2995 2996 for annotation_file in annotations_list_input: 2997 2998 # Explode annotations if ALL 2999 if ( 3000 annotation_file.upper() == "ALL" 3001 or annotation_file.upper().startswith("ALL:") 3002 ): 3003 3004 # check ALL parameters (formats, 
releases) 3005 annotation_file_split = annotation_file.split(":") 3006 database_formats = "parquet" 3007 database_releases = "current" 3008 for annotation_file_option in annotation_file_split[1:]: 3009 database_all_options_split = annotation_file_option.split("=") 3010 if database_all_options_split[0] == "format": 3011 database_formats = database_all_options_split[1].split("+") 3012 if database_all_options_split[0] == "release": 3013 database_releases = database_all_options_split[1].split("+") 3014 3015 # Scan for availabled databases 3016 databases_infos_dict = self.scan_databases( 3017 database_formats=database_formats, 3018 database_releases=database_releases, 3019 ) 3020 3021 # Add found databases in annotation parameters 3022 for database_infos in databases_infos_dict.keys(): 3023 annotations_list[database_infos] = {"INFO": None} 3024 3025 else: 3026 annotations_list[annotation_file] = annotations_list_input[ 3027 annotation_file 3028 ] 3029 3030 # Check each databases 3031 if len(annotations_list): 3032 3033 log.info( 3034 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3035 ) 3036 3037 for annotation_file in annotations_list: 3038 3039 # Init 3040 annotations = annotations_list.get(annotation_file, None) 3041 3042 # Annotation snpEff 3043 if annotation_file.startswith("snpeff"): 3044 3045 log.debug(f"Quick Annotation snpEff") 3046 3047 if "snpeff" not in param["annotation"]: 3048 param["annotation"]["snpeff"] = {} 3049 3050 if "options" not in param["annotation"]["snpeff"]: 3051 param["annotation"]["snpeff"]["options"] = "" 3052 3053 # snpEff options in annotations 3054 param["annotation"]["snpeff"]["options"] = "".join( 3055 annotation_file.split(":")[1:] 3056 ) 3057 3058 # Annotation Annovar 3059 elif annotation_file.startswith("annovar"): 3060 3061 log.debug(f"Quick Annotation Annovar") 3062 3063 if "annovar" not in param["annotation"]: 3064 param["annotation"]["annovar"] = {} 3065 3066 if "annotations" not in param["annotation"]["annovar"]: 3067 param["annotation"]["annovar"]["annotations"] = {} 3068 3069 # Options 3070 annotation_file_split = annotation_file.split(":") 3071 for annotation_file_annotation in annotation_file_split[1:]: 3072 if annotation_file_annotation: 3073 param["annotation"]["annovar"]["annotations"][ 3074 annotation_file_annotation 3075 ] = annotations 3076 3077 # Annotation Exomiser 3078 elif annotation_file.startswith("exomiser"): 3079 3080 log.debug(f"Quick Annotation Exomiser") 3081 3082 param["annotation"]["exomiser"] = params_string_to_dict( 3083 annotation_file 3084 ) 3085 3086 # Annotation Splice 3087 elif annotation_file.startswith("splice"): 3088 3089 log.debug(f"Quick Annotation Splice") 3090 3091 param["annotation"]["splice"] = params_string_to_dict( 3092 annotation_file 3093 ) 3094 3095 # Annotation Parquet or BCFTOOLS 3096 else: 3097 3098 # Tools detection 3099 if annotation_file.startswith("bcftools:"): 3100 annotation_tool_initial = "bcftools" 3101 annotation_file = ":".join(annotation_file.split(":")[1:]) 3102 elif annotation_file.startswith("snpsift:"): 3103 annotation_tool_initial = 
"snpsift" 3104 annotation_file = ":".join(annotation_file.split(":")[1:]) 3105 elif annotation_file.startswith("bigwig:"): 3106 annotation_tool_initial = "bigwig" 3107 annotation_file = ":".join(annotation_file.split(":")[1:]) 3108 else: 3109 annotation_tool_initial = None 3110 3111 # list of files 3112 annotation_file_list = annotation_file.replace("+", ":").split( 3113 ":" 3114 ) 3115 3116 for annotation_file in annotation_file_list: 3117 3118 if annotation_file: 3119 3120 # Annotation tool initial 3121 annotation_tool = annotation_tool_initial 3122 3123 # Find file 3124 annotation_file_found = None 3125 3126 if os.path.exists(annotation_file): 3127 annotation_file_found = annotation_file 3128 elif os.path.exists(full_path(annotation_file)): 3129 annotation_file_found = full_path(annotation_file) 3130 else: 3131 # Find within assembly folders 3132 for annotations_database in annotations_databases: 3133 found_files = find_all( 3134 annotation_file, 3135 os.path.join( 3136 annotations_database, assembly 3137 ), 3138 ) 3139 if len(found_files) > 0: 3140 annotation_file_found = found_files[0] 3141 break 3142 if not annotation_file_found and not assembly: 3143 # Find within folders 3144 for ( 3145 annotations_database 3146 ) in annotations_databases: 3147 found_files = find_all( 3148 annotation_file, annotations_database 3149 ) 3150 if len(found_files) > 0: 3151 annotation_file_found = found_files[0] 3152 break 3153 log.debug( 3154 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3155 ) 3156 3157 # Full path 3158 annotation_file_found = full_path(annotation_file_found) 3159 3160 if annotation_file_found: 3161 3162 database = Database(database=annotation_file_found) 3163 quick_annotation_format = database.get_format() 3164 quick_annotation_is_compressed = ( 3165 database.is_compressed() 3166 ) 3167 quick_annotation_is_indexed = os.path.exists( 3168 f"{annotation_file_found}.tbi" 3169 ) 3170 bcftools_preference = False 3171 3172 # Check Annotation 
Tool 3173 if not annotation_tool: 3174 if ( 3175 bcftools_preference 3176 and quick_annotation_format 3177 in ["vcf", "bed"] 3178 and quick_annotation_is_compressed 3179 and quick_annotation_is_indexed 3180 ): 3181 annotation_tool = "bcftools" 3182 elif quick_annotation_format in [ 3183 "vcf", 3184 "bed", 3185 "tsv", 3186 "tsv", 3187 "csv", 3188 "json", 3189 "tbl", 3190 "parquet", 3191 "duckdb", 3192 ]: 3193 annotation_tool = "parquet" 3194 elif quick_annotation_format in ["bw"]: 3195 annotation_tool = "bigwig" 3196 else: 3197 log.error( 3198 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3199 ) 3200 raise ValueError( 3201 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3202 ) 3203 3204 log.debug( 3205 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3206 ) 3207 3208 # Annotation Tool dispatch 3209 if annotation_tool: 3210 if annotation_tool not in param["annotation"]: 3211 param["annotation"][annotation_tool] = {} 3212 if ( 3213 "annotations" 3214 not in param["annotation"][annotation_tool] 3215 ): 3216 param["annotation"][annotation_tool][ 3217 "annotations" 3218 ] = {} 3219 param["annotation"][annotation_tool][ 3220 "annotations" 3221 ][annotation_file_found] = annotations 3222 3223 else: 3224 log.warning( 3225 f"Quick Annotation File {annotation_file} does NOT exist" 3226 ) 3227 3228 self.set_param(param) 3229 3230 if param.get("annotation", None): 3231 log.info("Annotations") 3232 if param.get("annotation", {}).get("parquet", None): 3233 log.info("Annotations 'parquet'...") 3234 self.annotation_parquet() 3235 if param.get("annotation", {}).get("bcftools", None): 3236 log.info("Annotations 'bcftools'...") 3237 self.annotation_bcftools() 3238 if param.get("annotation", {}).get("snpsift", None): 3239 log.info("Annotations 'snpsift'...") 3240 self.annotation_snpsift() 3241 if param.get("annotation", {}).get("bigwig", None): 
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def annotation_bigwig(self, threads: int = None) -> None:
        """
        Annotate variants with BigWig databases.

        Workflow: export the variants into a temporary VCF (``export_variant_vcf``),
        open each configured BigWig database with ``pyBigWig``, read the value
        covering each variant position (``bw.values(CHROM, POS - 1, POS)``),
        write annotated records with ``cyvcf2.Writer``, then merge the annotated
        VCF back into the variants table with ``update_from_vcf``.

        Databases come from param ``annotation.bigwig.annotations`` and are looked
        up in the configured ``annotations`` and ``bigwig`` database folders for
        the current assembly. HTTP URLs are accepted (experimental): in that case
        a minimal header file is generated automatically in the temp folder.

        :param threads: currently unused - the thread-handling code below is
        commented out
        :type threads: int
        :return: True when annotation ran to completion; returns early (implicitly
        None) when the variants table is empty. NOTE(review): the signature
        declares ``-> None`` although ``return True`` is reached on success.
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - databases folders: merged 'annotations' and 'bigwig' folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - dict of database -> annotation fields to apply
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # One config entry per BigWig database (open handle + header info)
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    # Default to all INFO fields when none requested
                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is an HTTP URL, keep it remote and synthesize a header
                    if database.get_database().startswith("http"):

                        # Database is an HTTP URL
                        db_file_is_http = True

                        # DB file kept as URL (pyBigWig can open remote files)
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file (single Float INFO field)
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT an HTTP URL
                        db_file_is_http = False

                    # Validate database: file (or URL), header and 'bw' format required
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database
                        # NOTE(review): annotation_fields_full is set but never
                        # read in this method (annotation_snpsift uses the same
                        # flag to drop its '-info' option)
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name (rename target)
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                # Column index minus the 3 coordinate columns
                                # (#CHROM START END) gives the data-column index
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF (so the merged-back INFO field is declared)
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database (handle stays open for the variant loop below)
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file (INFO stripped, no samples, tabix-indexed)
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")
                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value at the variant position
                            # (POS is 1-based in VCF, BigWig intervals are 0-based)
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation fields (and indexes)
                            # NOTE(review): res holds one value per base of the
                            # one-base window, so indexing it with the header
                            # column index assumes a single data column (index 0)
                            # - TODO confirm for multi-column headers
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # Skip NaN values (position not covered by the BigWig track)
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True

    def annotation_snpsift(self, threads: int = None) -> None:
        """
        Annotate variants with SnpSift using VCF databases, piping the SnpSift
        output through bcftools (field renaming and bgzip compression), then
        merge the annotated VCF back into the variants table.

        :param threads: Number of threads to use
        :return: None; returns early when the variants table is empty
3625 """ 3626 3627 # DEBUG 3628 log.debug("Start annotation with bcftools databases") 3629 3630 # Threads 3631 if not threads: 3632 threads = self.get_threads() 3633 log.debug("Threads: " + str(threads)) 3634 3635 # Config 3636 config = self.get_config() 3637 log.debug("Config: " + str(config)) 3638 3639 # Config - snpSift 3640 snpsift_bin_command = get_bin_command( 3641 bin="SnpSift.jar", 3642 tool="snpsift", 3643 bin_type="jar", 3644 config=config, 3645 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3646 ) 3647 if not snpsift_bin_command: 3648 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3649 log.error(msg_err) 3650 raise ValueError(msg_err) 3651 3652 # Config - bcftools 3653 bcftools_bin_command = get_bin_command( 3654 bin="bcftools", 3655 tool="bcftools", 3656 bin_type="bin", 3657 config=config, 3658 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3659 ) 3660 if not bcftools_bin_command: 3661 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3662 log.error(msg_err) 3663 raise ValueError(msg_err) 3664 3665 # Config - BCFTools databases folders 3666 databases_folders = set( 3667 self.get_config() 3668 .get("folders", {}) 3669 .get("databases", {}) 3670 .get("annotations", ["."]) 3671 + self.get_config() 3672 .get("folders", {}) 3673 .get("databases", {}) 3674 .get("bcftools", ["."]) 3675 ) 3676 log.debug("Databases annotations: " + str(databases_folders)) 3677 3678 # Param 3679 annotations = ( 3680 self.get_param() 3681 .get("annotation", {}) 3682 .get("snpsift", {}) 3683 .get("annotations", None) 3684 ) 3685 log.debug("Annotations: " + str(annotations)) 3686 3687 # Assembly 3688 assembly = self.get_param().get( 3689 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3690 ) 3691 3692 # Data 3693 table_variants = self.get_table_variants() 3694 3695 # Check if not empty 3696 log.debug("Check if not empty") 3697 sql_query_chromosomes = ( 3698 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3699 ) 3700 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3701 if not sql_query_chromosomes_df["count"][0]: 3702 log.info(f"VCF empty") 3703 return 3704 3705 # VCF header 3706 vcf_reader = self.get_header() 3707 log.debug("Initial header: " + str(vcf_reader.infos)) 3708 3709 # Existing annotations 3710 for vcf_annotation in self.get_header().infos: 3711 3712 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3713 log.debug( 3714 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3715 ) 3716 3717 if annotations: 3718 3719 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3720 3721 # Export VCF file 3722 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3723 3724 # Init 3725 commands = {} 3726 3727 for annotation in annotations: 3728 annotation_fields = annotations[annotation] 3729 3730 # Annotation Name 3731 annotation_name = os.path.basename(annotation) 3732 3733 if not annotation_fields: 3734 annotation_fields = {"INFO": None} 3735 3736 log.debug(f"Annotation '{annotation_name}'") 3737 log.debug( 3738 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3739 ) 3740 3741 # Create Database 3742 database = Database( 3743 database=annotation, 3744 databases_folders=databases_folders, 3745 assembly=assembly, 3746 ) 3747 3748 # Find files 3749 db_file = database.get_database() 3750 db_file = full_path(db_file) 3751 db_hdr_file = database.get_header_file() 3752 db_hdr_file = full_path(db_hdr_file) 3753 db_file_type = database.get_format() 3754 db_tbi_file = f"{db_file}.tbi" 3755 db_file_compressed = database.is_compressed() 3756 3757 # Check if compressed 3758 if not db_file_compressed: 3759 log.error( 3760 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3761 ) 3762 raise ValueError( 3763 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3764 ) 3765 3766 # Check if indexed 3767 if not os.path.exists(db_tbi_file): 3768 log.error( 3769 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3770 ) 3771 raise ValueError( 3772 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3773 ) 3774 3775 # Check index - try to create if not exists 3776 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3777 log.error("Annotation failed: database not valid") 3778 log.error(f"Annotation annotation file: {db_file}") 3779 log.error(f"Annotation annotation header: {db_hdr_file}") 3780 log.error(f"Annotation annotation index: {db_tbi_file}") 3781 raise ValueError( 3782 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3783 ) 3784 else: 3785 3786 log.debug( 3787 f"Annotation '{annotation}' - file: " 3788 + str(db_file) 3789 + " and " 3790 + str(db_hdr_file) 3791 ) 3792 3793 # Load header as VCF object 3794 db_hdr_vcf = Variants(input=db_hdr_file) 3795 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3796 log.debug( 3797 "Annotation database header: " 3798 + str(db_hdr_vcf_header_infos) 3799 ) 3800 3801 # For all fields in database 3802 annotation_fields_full = False 3803 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3804 annotation_fields = { 3805 key: key for key in db_hdr_vcf_header_infos 3806 } 3807 log.debug( 3808 "Annotation database header - All annotations added: " 3809 + str(annotation_fields) 3810 ) 3811 annotation_fields_full = True 3812 3813 # # Create file for field rename 3814 # log.debug("Create file for field rename") 3815 # tmp_rename = NamedTemporaryFile( 3816 # prefix=self.get_prefix(), 3817 # dir=self.get_tmp_dir(), 3818 # suffix=".rename", 3819 # delete=False, 3820 # ) 3821 # tmp_rename_name = tmp_rename.name 3822 # tmp_files.append(tmp_rename_name) 3823 3824 # Number of fields 3825 nb_annotation_field = 0 3826 annotation_list = [] 3827 annotation_infos_rename_list = [] 3828 3829 for annotation_field in 
annotation_fields: 3830 3831 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3832 annotation_fields_new_name = annotation_fields.get( 3833 annotation_field, annotation_field 3834 ) 3835 if not annotation_fields_new_name: 3836 annotation_fields_new_name = annotation_field 3837 3838 # Check if field is in DB and if field is not elready in input data 3839 if ( 3840 annotation_field in db_hdr_vcf.get_header().infos 3841 and annotation_fields_new_name 3842 not in self.get_header().infos 3843 ): 3844 3845 log.info( 3846 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3847 ) 3848 3849 # BCFTools annotate param to rename fields 3850 if annotation_field != annotation_fields_new_name: 3851 annotation_infos_rename_list.append( 3852 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3853 ) 3854 3855 # Add INFO field to header 3856 db_hdr_vcf_header_infos_number = ( 3857 db_hdr_vcf_header_infos[annotation_field].num or "." 3858 ) 3859 db_hdr_vcf_header_infos_type = ( 3860 db_hdr_vcf_header_infos[annotation_field].type 3861 or "String" 3862 ) 3863 db_hdr_vcf_header_infos_description = ( 3864 db_hdr_vcf_header_infos[annotation_field].desc 3865 or f"{annotation_field} description" 3866 ) 3867 db_hdr_vcf_header_infos_source = ( 3868 db_hdr_vcf_header_infos[annotation_field].source 3869 or "unknown" 3870 ) 3871 db_hdr_vcf_header_infos_version = ( 3872 db_hdr_vcf_header_infos[annotation_field].version 3873 or "unknown" 3874 ) 3875 3876 vcf_reader.infos[annotation_fields_new_name] = ( 3877 vcf.parser._Info( 3878 annotation_fields_new_name, 3879 db_hdr_vcf_header_infos_number, 3880 db_hdr_vcf_header_infos_type, 3881 db_hdr_vcf_header_infos_description, 3882 db_hdr_vcf_header_infos_source, 3883 db_hdr_vcf_header_infos_version, 3884 self.code_type_map[ 3885 db_hdr_vcf_header_infos_type 3886 ], 3887 ) 3888 ) 3889 3890 annotation_list.append(annotation_field) 3891 3892 nb_annotation_field += 1 3893 3894 else: 3895 
3896 if ( 3897 annotation_field 3898 not in db_hdr_vcf.get_header().infos 3899 ): 3900 log.warning( 3901 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3902 ) 3903 if ( 3904 annotation_fields_new_name 3905 in self.get_header().infos 3906 ): 3907 log.warning( 3908 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3909 ) 3910 3911 log.info( 3912 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3913 ) 3914 3915 annotation_infos = ",".join(annotation_list) 3916 3917 if annotation_infos != "": 3918 3919 # Annotated VCF (and error file) 3920 tmp_annotation_vcf_name = os.path.join( 3921 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3922 ) 3923 tmp_annotation_vcf_name_err = ( 3924 tmp_annotation_vcf_name + ".err" 3925 ) 3926 3927 # Add fields to annotate 3928 if not annotation_fields_full: 3929 annotation_infos_option = f"-info {annotation_infos}" 3930 else: 3931 annotation_infos_option = "" 3932 3933 # Info fields rename 3934 if annotation_infos_rename_list: 3935 annotation_infos_rename = " -c " + ",".join( 3936 annotation_infos_rename_list 3937 ) 3938 else: 3939 annotation_infos_rename = "" 3940 3941 # Annotate command 3942 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3943 3944 # Add command 3945 commands[command_annotate] = tmp_annotation_vcf_name 3946 3947 if commands: 3948 3949 # Export VCF file 3950 self.export_variant_vcf( 3951 vcf_file=tmp_vcf_name, 3952 remove_info=True, 3953 add_samples=False, 3954 index=True, 3955 ) 3956 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3957 3958 # Num command 3959 nb_command = 0 3960 3961 # Annotate 3962 for command_annotate in commands: 3963 nb_command += 1 3964 log.info( 3965 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..." 3966 ) 3967 log.debug(f"command_annotate={command_annotate}") 3968 run_parallel_commands([command_annotate], threads) 3969 3970 # Debug 3971 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3972 3973 # Update variants 3974 log.info( 3975 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3976 ) 3977 self.update_from_vcf(commands[command_annotate]) 3978 3979 def annotation_bcftools(self, threads: int = None) -> None: 3980 """ 3981 This function annotate with bcftools 3982 3983 :param threads: Number of threads to use 3984 :return: the value of the variable "return_value". 3985 """ 3986 3987 # DEBUG 3988 log.debug("Start annotation with bcftools databases") 3989 3990 # Threads 3991 if not threads: 3992 threads = self.get_threads() 3993 log.debug("Threads: " + str(threads)) 3994 3995 # Config 3996 config = self.get_config() 3997 log.debug("Config: " + str(config)) 3998 3999 # DEBUG 4000 delete_tmp = True 4001 if self.get_config().get("verbosity", "warning") in ["debug"]: 4002 delete_tmp = False 4003 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4004 4005 # Config - BCFTools bin command 4006 bcftools_bin_command = get_bin_command( 4007 bin="bcftools", 4008 tool="bcftools", 4009 bin_type="bin", 4010 config=config, 4011 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 4012 ) 4013 if not bcftools_bin_command: 4014 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 4015 log.error(msg_err) 4016 raise ValueError(msg_err) 4017 4018 # Config - BCFTools databases folders 4019 databases_folders = set( 4020 self.get_config() 4021 .get("folders", {}) 4022 .get("databases", {}) 4023 .get("annotations", ["."]) 4024 + self.get_config() 4025 .get("folders", {}) 4026 .get("databases", {}) 4027 .get("bcftools", ["."]) 4028 ) 4029 log.debug("Databases annotations: " + str(databases_folders)) 4030 4031 # Param 4032 annotations = ( 4033 self.get_param() 4034 .get("annotation", {}) 4035 
.get("bcftools", {}) 4036 .get("annotations", None) 4037 ) 4038 log.debug("Annotations: " + str(annotations)) 4039 4040 # Assembly 4041 assembly = self.get_param().get( 4042 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 4043 ) 4044 4045 # Data 4046 table_variants = self.get_table_variants() 4047 4048 # Check if not empty 4049 log.debug("Check if not empty") 4050 sql_query_chromosomes = ( 4051 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4052 ) 4053 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 4054 if not sql_query_chromosomes_df["count"][0]: 4055 log.info(f"VCF empty") 4056 return 4057 4058 # Export in VCF 4059 log.debug("Create initial file to annotate") 4060 tmp_vcf = NamedTemporaryFile( 4061 prefix=self.get_prefix(), 4062 dir=self.get_tmp_dir(), 4063 suffix=".vcf.gz", 4064 delete=False, 4065 ) 4066 tmp_vcf_name = tmp_vcf.name 4067 4068 # VCF header 4069 vcf_reader = self.get_header() 4070 log.debug("Initial header: " + str(vcf_reader.infos)) 4071 4072 # Existing annotations 4073 for vcf_annotation in self.get_header().infos: 4074 4075 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4076 log.debug( 4077 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4078 ) 4079 4080 if annotations: 4081 4082 tmp_ann_vcf_list = [] 4083 commands = [] 4084 tmp_files = [] 4085 err_files = [] 4086 4087 for annotation in annotations: 4088 annotation_fields = annotations[annotation] 4089 4090 # Annotation Name 4091 annotation_name = os.path.basename(annotation) 4092 4093 if not annotation_fields: 4094 annotation_fields = {"INFO": None} 4095 4096 log.debug(f"Annotation '{annotation_name}'") 4097 log.debug( 4098 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 4099 ) 4100 4101 # Create Database 4102 database = Database( 4103 database=annotation, 4104 databases_folders=databases_folders, 4105 assembly=assembly, 4106 ) 4107 4108 # Find files 4109 db_file = 
database.get_database() 4110 db_file = full_path(db_file) 4111 db_hdr_file = database.get_header_file() 4112 db_hdr_file = full_path(db_hdr_file) 4113 db_file_type = database.get_format() 4114 db_tbi_file = f"{db_file}.tbi" 4115 db_file_compressed = database.is_compressed() 4116 4117 # Check if compressed 4118 if not db_file_compressed: 4119 log.error( 4120 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4121 ) 4122 raise ValueError( 4123 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4124 ) 4125 4126 # Check if indexed 4127 if not os.path.exists(db_tbi_file): 4128 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 4129 raise ValueError( 4130 f"Annotation '{annotation}' - {db_file} NOT indexed file" 4131 ) 4132 4133 # Check index - try to create if not exists 4134 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 4135 log.error("Annotation failed: database not valid") 4136 log.error(f"Annotation annotation file: {db_file}") 4137 log.error(f"Annotation annotation header: {db_hdr_file}") 4138 log.error(f"Annotation annotation index: {db_tbi_file}") 4139 raise ValueError( 4140 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 4141 ) 4142 else: 4143 4144 log.debug( 4145 f"Annotation '{annotation}' - file: " 4146 + str(db_file) 4147 + " and " 4148 + str(db_hdr_file) 4149 ) 4150 4151 # Load header as VCF object 4152 db_hdr_vcf = Variants(input=db_hdr_file) 4153 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 4154 log.debug( 4155 "Annotation database header: " + str(db_hdr_vcf_header_infos) 4156 ) 4157 4158 # For all fields in database 4159 if "ALL" in annotation_fields or "INFO" in annotation_fields: 4160 annotation_fields = { 4161 key: key for key in db_hdr_vcf_header_infos 4162 } 4163 log.debug( 4164 "Annotation database header - All annotations added: " 4165 + 
str(annotation_fields) 4166 ) 4167 4168 # Number of fields 4169 nb_annotation_field = 0 4170 annotation_list = [] 4171 4172 for annotation_field in annotation_fields: 4173 4174 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 4175 annotation_fields_new_name = annotation_fields.get( 4176 annotation_field, annotation_field 4177 ) 4178 if not annotation_fields_new_name: 4179 annotation_fields_new_name = annotation_field 4180 4181 # Check if field is in DB and if field is not elready in input data 4182 if ( 4183 annotation_field in db_hdr_vcf.get_header().infos 4184 and annotation_fields_new_name 4185 not in self.get_header().infos 4186 ): 4187 4188 log.info( 4189 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 4190 ) 4191 4192 # Add INFO field to header 4193 db_hdr_vcf_header_infos_number = ( 4194 db_hdr_vcf_header_infos[annotation_field].num or "." 4195 ) 4196 db_hdr_vcf_header_infos_type = ( 4197 db_hdr_vcf_header_infos[annotation_field].type 4198 or "String" 4199 ) 4200 db_hdr_vcf_header_infos_description = ( 4201 db_hdr_vcf_header_infos[annotation_field].desc 4202 or f"{annotation_field} description" 4203 ) 4204 db_hdr_vcf_header_infos_source = ( 4205 db_hdr_vcf_header_infos[annotation_field].source 4206 or "unknown" 4207 ) 4208 db_hdr_vcf_header_infos_version = ( 4209 db_hdr_vcf_header_infos[annotation_field].version 4210 or "unknown" 4211 ) 4212 4213 vcf_reader.infos[annotation_fields_new_name] = ( 4214 vcf.parser._Info( 4215 annotation_fields_new_name, 4216 db_hdr_vcf_header_infos_number, 4217 db_hdr_vcf_header_infos_type, 4218 db_hdr_vcf_header_infos_description, 4219 db_hdr_vcf_header_infos_source, 4220 db_hdr_vcf_header_infos_version, 4221 self.code_type_map[db_hdr_vcf_header_infos_type], 4222 ) 4223 ) 4224 4225 # annotation_list.append(annotation_field) 4226 if annotation_field != annotation_fields_new_name: 4227 annotation_list.append( 4228 
f"{annotation_fields_new_name}:=INFO/{annotation_field}" 4229 ) 4230 else: 4231 annotation_list.append(annotation_field) 4232 4233 nb_annotation_field += 1 4234 4235 else: 4236 4237 if annotation_field not in db_hdr_vcf.get_header().infos: 4238 log.warning( 4239 f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 4240 ) 4241 if annotation_fields_new_name in self.get_header().infos: 4242 log.warning( 4243 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 4244 ) 4245 4246 log.info( 4247 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 4248 ) 4249 4250 annotation_infos = ",".join(annotation_list) 4251 4252 if annotation_infos != "": 4253 4254 # Protect header for bcftools (remove "#CHROM" and variants line) 4255 log.debug("Protect Header file - remove #CHROM line if exists") 4256 tmp_header_vcf = NamedTemporaryFile( 4257 prefix=self.get_prefix(), 4258 dir=self.get_tmp_dir(), 4259 suffix=".hdr", 4260 delete=False, 4261 ) 4262 tmp_header_vcf_name = tmp_header_vcf.name 4263 tmp_files.append(tmp_header_vcf_name) 4264 # Command 4265 if db_hdr_file.endswith(".gz"): 4266 command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4267 else: 4268 command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4269 # Run 4270 run_parallel_commands([command_extract_header], 1) 4271 4272 # Find chomosomes 4273 log.debug("Find chromosomes ") 4274 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 4275 sql_query_chromosomes_df = self.get_query_to_df( 4276 sql_query_chromosomes 4277 ) 4278 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 4279 4280 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 4281 4282 # BED columns in the annotation file 4283 if db_file_type in ["bed"]: 4284 annotation_infos = 
"CHROM,POS,POS," + annotation_infos 4285 4286 for chrom in chomosomes_list: 4287 4288 # Create BED on initial VCF 4289 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 4290 tmp_bed = NamedTemporaryFile( 4291 prefix=self.get_prefix(), 4292 dir=self.get_tmp_dir(), 4293 suffix=".bed", 4294 delete=False, 4295 ) 4296 tmp_bed_name = tmp_bed.name 4297 tmp_files.append(tmp_bed_name) 4298 4299 # Detecte regions 4300 log.debug( 4301 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 4302 ) 4303 window = 1000000 4304 sql_query_intervals_for_bed = f""" 4305 SELECT \"#CHROM\", 4306 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 4307 \"POS\"+{window} 4308 FROM {table_variants} as table_variants 4309 WHERE table_variants.\"#CHROM\" = '{chrom}' 4310 """ 4311 regions = self.conn.execute( 4312 sql_query_intervals_for_bed 4313 ).fetchall() 4314 merged_regions = merge_regions(regions) 4315 log.debug( 4316 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 
4317 ) 4318 4319 header = ["#CHROM", "START", "END"] 4320 with open(tmp_bed_name, "w") as f: 4321 # Write the header with tab delimiter 4322 f.write("\t".join(header) + "\n") 4323 for d in merged_regions: 4324 # Write each data row with tab delimiter 4325 f.write("\t".join(map(str, d)) + "\n") 4326 4327 # Tmp files 4328 tmp_annotation_vcf = NamedTemporaryFile( 4329 prefix=self.get_prefix(), 4330 dir=self.get_tmp_dir(), 4331 suffix=".vcf.gz", 4332 delete=False, 4333 ) 4334 tmp_annotation_vcf_name = tmp_annotation_vcf.name 4335 tmp_files.append(tmp_annotation_vcf_name) 4336 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 4337 tmp_annotation_vcf_name_err = ( 4338 tmp_annotation_vcf_name + ".err" 4339 ) 4340 err_files.append(tmp_annotation_vcf_name_err) 4341 4342 # Annotate Command 4343 log.debug( 4344 f"Annotation '{annotation}' - add bcftools command" 4345 ) 4346 4347 # Command 4348 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 4349 4350 # Add command 4351 commands.append(command_annotate) 4352 4353 # if some commands 4354 if commands: 4355 4356 # Export VCF file 4357 self.export_variant_vcf( 4358 vcf_file=tmp_vcf_name, 4359 remove_info=True, 4360 add_samples=False, 4361 index=True, 4362 ) 4363 4364 # Threads 4365 # calculate threads for annotated commands 4366 if commands: 4367 threads_bcftools_annotate = round(threads / len(commands)) 4368 else: 4369 threads_bcftools_annotate = 1 4370 4371 if not threads_bcftools_annotate: 4372 threads_bcftools_annotate = 1 4373 4374 # Add threads option to bcftools commands 4375 if threads_bcftools_annotate > 1: 4376 commands_threaded = [] 4377 for command in commands: 4378 commands_threaded.append( 4379 command.replace( 4380 f"{bcftools_bin_command} annotate ", 
4381 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 4382 ) 4383 ) 4384 commands = commands_threaded 4385 4386 # Command annotation multithreading 4387 log.debug(f"Annotation - Annotation commands: " + str(commands)) 4388 log.info( 4389 f"Annotation - Annotation multithreaded in " 4390 + str(len(commands)) 4391 + " commands" 4392 ) 4393 4394 run_parallel_commands(commands, threads) 4395 4396 # Merge 4397 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4398 4399 if tmp_ann_vcf_list_cmd: 4400 4401 # Tmp file 4402 tmp_annotate_vcf = NamedTemporaryFile( 4403 prefix=self.get_prefix(), 4404 dir=self.get_tmp_dir(), 4405 suffix=".vcf.gz", 4406 delete=True, 4407 ) 4408 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4409 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4410 err_files.append(tmp_annotate_vcf_name_err) 4411 4412 # Tmp file remove command 4413 tmp_files_remove_command = "" 4414 if tmp_files: 4415 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4416 4417 # Command merge 4418 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4419 log.info( 4420 f"Annotation - Annotation merging " 4421 + str(len(commands)) 4422 + " annotated files" 4423 ) 4424 log.debug(f"Annotation - merge command: {merge_command}") 4425 run_parallel_commands([merge_command], 1) 4426 4427 # Error messages 4428 log.info(f"Error/Warning messages:") 4429 error_message_command_all = [] 4430 error_message_command_warning = [] 4431 error_message_command_err = [] 4432 for err_file in err_files: 4433 with open(err_file, "r") as f: 4434 for line in f: 4435 message = line.strip() 4436 error_message_command_all.append(message) 4437 if line.startswith("[W::"): 4438 error_message_command_warning.append(message) 4439 if line.startswith("[E::"): 4440 error_message_command_err.append( 4441 f"{err_file}: 
" + message 4442 ) 4443 # log info 4444 for message in list( 4445 set(error_message_command_err + error_message_command_warning) 4446 ): 4447 log.info(f" {message}") 4448 # debug info 4449 for message in list(set(error_message_command_all)): 4450 log.debug(f" {message}") 4451 # failed 4452 if len(error_message_command_err): 4453 log.error("Annotation failed: Error in commands") 4454 raise ValueError("Annotation failed: Error in commands") 4455 4456 # Update variants 4457 log.info(f"Annotation - Updating...") 4458 self.update_from_vcf(tmp_annotate_vcf_name) 4459 4460 def annotation_exomiser(self, threads: int = None) -> None: 4461 """ 4462 This function annotate with Exomiser 4463 4464 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4465 - "analysis" (dict/file): 4466 Full analysis dictionnary parameters (see Exomiser docs). 4467 Either a dict, or a file in JSON or YAML format. 4468 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4469 Default : None 4470 - "preset" (string): 4471 Analysis preset (available in config folder). 4472 Used if no full "analysis" is provided. 4473 Default: "exome" 4474 - "phenopacket" (dict/file): 4475 Samples and phenotipic features parameters (see Exomiser docs). 4476 Either a dict, or a file in JSON or YAML format. 4477 Default: None 4478 - "subject" (dict): 4479 Sample parameters (see Exomiser docs). 4480 Example: 4481 "subject": 4482 { 4483 "id": "ISDBM322017", 4484 "sex": "FEMALE" 4485 } 4486 Default: None 4487 - "sample" (string): 4488 Sample name to construct "subject" section: 4489 "subject": 4490 { 4491 "id": "<sample>", 4492 "sex": "UNKNOWN_SEX" 4493 } 4494 Default: None 4495 - "phenotypicFeatures" (dict) 4496 Phenotypic features to construct "subject" section. 
4497 Example: 4498 "phenotypicFeatures": 4499 [ 4500 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4501 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4502 ] 4503 - "hpo" (list) 4504 List of HPO ids as phenotypic features. 4505 Example: 4506 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4507 Default: [] 4508 - "outputOptions" (dict): 4509 Output options (see Exomiser docs). 4510 Default: 4511 "output_options" = 4512 { 4513 "outputContributingVariantsOnly": False, 4514 "numGenes": 0, 4515 "outputFormats": ["TSV_VARIANT", "VCF"] 4516 } 4517 - "transcript_source" (string): 4518 Transcript source (either "refseq", "ucsc", "ensembl") 4519 Default: "refseq" 4520 - "exomiser_to_info" (boolean): 4521 Add exomiser TSV file columns as INFO fields in VCF. 4522 Default: False 4523 - "release" (string): 4524 Exomise database release. 4525 If not exists, database release will be downloaded (take a while). 4526 Default: None (provided by application.properties configuration file) 4527 - "exomiser_application_properties" (file): 4528 Exomiser configuration file (see Exomiser docs). 4529 Useful to automatically download databases (especially for specific genome databases). 4530 4531 Notes: 4532 - If no sample in parameters, first sample in VCF will be chosen 4533 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4534 4535 :param threads: The number of threads to use 4536 :return: None. 
4537 """ 4538 4539 # DEBUG 4540 log.debug("Start annotation with Exomiser databases") 4541 4542 # Threads 4543 if not threads: 4544 threads = self.get_threads() 4545 log.debug("Threads: " + str(threads)) 4546 4547 # Config 4548 config = self.get_config() 4549 log.debug("Config: " + str(config)) 4550 4551 # Config - Folders - Databases 4552 databases_folders = ( 4553 config.get("folders", {}) 4554 .get("databases", {}) 4555 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4556 ) 4557 databases_folders = full_path(databases_folders) 4558 if not os.path.exists(databases_folders): 4559 log.error(f"Databases annotations: {databases_folders} NOT found") 4560 log.debug("Databases annotations: " + str(databases_folders)) 4561 4562 # Config - Exomiser 4563 exomiser_bin_command = get_bin_command( 4564 bin="exomiser-cli*.jar", 4565 tool="exomiser", 4566 bin_type="jar", 4567 config=config, 4568 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4569 ) 4570 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4571 if not exomiser_bin_command: 4572 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4573 log.error(msg_err) 4574 raise ValueError(msg_err) 4575 4576 # Param 4577 param = self.get_param() 4578 log.debug("Param: " + str(param)) 4579 4580 # Param - Exomiser 4581 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4582 log.debug(f"Param Exomiser: {param_exomiser}") 4583 4584 # Param - Assembly 4585 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4586 log.debug("Assembly: " + str(assembly)) 4587 4588 # Data 4589 table_variants = self.get_table_variants() 4590 4591 # Check if not empty 4592 log.debug("Check if not empty") 4593 sql_query_chromosomes = ( 4594 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4595 ) 4596 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4597 log.info(f"VCF empty") 4598 return False 4599 4600 # VCF header 4601 
vcf_reader = self.get_header() 4602 log.debug("Initial header: " + str(vcf_reader.infos)) 4603 4604 # Samples 4605 samples = self.get_header_sample_list() 4606 if not samples: 4607 log.error("No Samples in VCF") 4608 return False 4609 log.debug(f"Samples: {samples}") 4610 4611 # Memory limit 4612 memory_limit = self.get_memory("8G") 4613 log.debug(f"memory_limit: {memory_limit}") 4614 4615 # Exomiser java options 4616 exomiser_java_options = ( 4617 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4618 ) 4619 log.debug(f"Exomiser java options: {exomiser_java_options}") 4620 4621 # Download Exomiser (if not exists) 4622 exomiser_release = param_exomiser.get("release", None) 4623 exomiser_application_properties = param_exomiser.get( 4624 "exomiser_application_properties", None 4625 ) 4626 databases_download_exomiser( 4627 assemblies=[assembly], 4628 exomiser_folder=databases_folders, 4629 exomiser_release=exomiser_release, 4630 exomiser_phenotype_release=exomiser_release, 4631 exomiser_application_properties=exomiser_application_properties, 4632 ) 4633 4634 # Force annotation 4635 force_update_annotation = True 4636 4637 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4638 log.debug("Start annotation Exomiser") 4639 4640 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4641 4642 # tmp_dir = "/tmp/exomiser" 4643 4644 ### ANALYSIS ### 4645 ################ 4646 4647 # Create analysis.json through analysis dict 4648 # either analysis in param or by default 4649 # depending on preset exome/genome) 4650 4651 # Init analysis dict 4652 param_exomiser_analysis_dict = {} 4653 4654 # analysis from param 4655 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4656 param_exomiser_analysis = full_path(param_exomiser_analysis) 4657 4658 # If analysis in param -> load anlaysis json 4659 if param_exomiser_analysis: 4660 4661 # If param analysis is a file and exists 4662 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4663 param_exomiser_analysis 4664 ): 4665 # Load analysis file into analysis dict (either yaml or json) 4666 with open(param_exomiser_analysis) as json_file: 4667 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4668 4669 # If param analysis is a dict 4670 elif isinstance(param_exomiser_analysis, dict): 4671 # Load analysis dict into analysis dict (either yaml or json) 4672 param_exomiser_analysis_dict = param_exomiser_analysis 4673 4674 # Error analysis type 4675 else: 4676 log.error(f"Analysis type unknown. Check param file.") 4677 raise ValueError(f"Analysis type unknown. Check param file.") 4678 4679 # Case no input analysis config file/dict 4680 # Use preset (exome/genome) to open default config file 4681 if not param_exomiser_analysis_dict: 4682 4683 # default preset 4684 default_preset = "exome" 4685 4686 # Get param preset or default preset 4687 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4688 4689 # Try to find if preset is a file 4690 if os.path.exists(param_exomiser_preset): 4691 # Preset file is provided in full path 4692 param_exomiser_analysis_default_config_file = ( 4693 param_exomiser_preset 4694 ) 4695 # elif os.path.exists(full_path(param_exomiser_preset)): 4696 # # Preset file is provided in full path 4697 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4698 elif os.path.exists( 4699 os.path.join(folder_config, param_exomiser_preset) 4700 ): 4701 # Preset file is provided a basename in config folder (can be a path with subfolders) 4702 param_exomiser_analysis_default_config_file = os.path.join( 4703 folder_config, param_exomiser_preset 4704 ) 4705 else: 4706 # Construct preset file 4707 param_exomiser_analysis_default_config_file = os.path.join( 4708 folder_config, 4709 f"preset-{param_exomiser_preset}-analysis.json", 4710 ) 4711 4712 # If preset file exists 4713 param_exomiser_analysis_default_config_file = full_path( 4714 
param_exomiser_analysis_default_config_file 4715 ) 4716 if os.path.exists(param_exomiser_analysis_default_config_file): 4717 # Load prest file into analysis dict (either yaml or json) 4718 with open( 4719 param_exomiser_analysis_default_config_file 4720 ) as json_file: 4721 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4722 json_file 4723 ) 4724 4725 # Error preset file 4726 else: 4727 log.error( 4728 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4729 ) 4730 raise ValueError( 4731 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4732 ) 4733 4734 # If no analysis dict created 4735 if not param_exomiser_analysis_dict: 4736 log.error(f"No analysis config") 4737 raise ValueError(f"No analysis config") 4738 4739 # Log 4740 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4741 4742 ### PHENOPACKET ### 4743 ################### 4744 4745 # If no PhenoPacket in analysis dict -> check in param 4746 if "phenopacket" not in param_exomiser_analysis_dict: 4747 4748 # If PhenoPacket in param -> load anlaysis json 4749 if param_exomiser.get("phenopacket", None): 4750 4751 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4752 param_exomiser_phenopacket = full_path( 4753 param_exomiser_phenopacket 4754 ) 4755 4756 # If param phenopacket is a file and exists 4757 if isinstance( 4758 param_exomiser_phenopacket, str 4759 ) and os.path.exists(param_exomiser_phenopacket): 4760 # Load phenopacket file into analysis dict (either yaml or json) 4761 with open(param_exomiser_phenopacket) as json_file: 4762 param_exomiser_analysis_dict["phenopacket"] = ( 4763 yaml.safe_load(json_file) 4764 ) 4765 4766 # If param phenopacket is a dict 4767 elif isinstance(param_exomiser_phenopacket, dict): 4768 # Load phenopacket dict into analysis dict (either yaml or json) 4769 param_exomiser_analysis_dict["phenopacket"] = ( 4770 param_exomiser_phenopacket 4771 ) 4772 4773 # Error phenopacket type 
4774 else: 4775 log.error(f"Phenopacket type unknown. Check param file.") 4776 raise ValueError( 4777 f"Phenopacket type unknown. Check param file." 4778 ) 4779 4780 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4781 if "phenopacket" not in param_exomiser_analysis_dict: 4782 4783 # Init PhenoPacket 4784 param_exomiser_analysis_dict["phenopacket"] = { 4785 "id": "analysis", 4786 "proband": {}, 4787 } 4788 4789 ### Add subject ### 4790 4791 # If subject exists 4792 param_exomiser_subject = param_exomiser.get("subject", {}) 4793 4794 # If subject not exists -> found sample ID 4795 if not param_exomiser_subject: 4796 4797 # Found sample ID in param 4798 sample = param_exomiser.get("sample", None) 4799 4800 # Find sample ID (first sample) 4801 if not sample: 4802 sample_list = self.get_header_sample_list() 4803 if len(sample_list) > 0: 4804 sample = sample_list[0] 4805 else: 4806 log.error(f"No sample found") 4807 raise ValueError(f"No sample found") 4808 4809 # Create subject 4810 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4811 4812 # Add to dict 4813 param_exomiser_analysis_dict["phenopacket"][ 4814 "subject" 4815 ] = param_exomiser_subject 4816 4817 ### Add "phenotypicFeatures" ### 4818 4819 # If phenotypicFeatures exists 4820 param_exomiser_phenotypicfeatures = param_exomiser.get( 4821 "phenotypicFeatures", [] 4822 ) 4823 4824 # If phenotypicFeatures not exists -> Try to infer from hpo list 4825 if not param_exomiser_phenotypicfeatures: 4826 4827 # Found HPO in param 4828 param_exomiser_hpo = param_exomiser.get("hpo", []) 4829 4830 # Split HPO if list in string format separated by comma 4831 if isinstance(param_exomiser_hpo, str): 4832 param_exomiser_hpo = param_exomiser_hpo.split(",") 4833 4834 # Create HPO list 4835 for hpo in param_exomiser_hpo: 4836 hpo_clean = re.sub("[^0-9]", "", hpo) 4837 param_exomiser_phenotypicfeatures.append( 4838 { 4839 "type": { 4840 "id": f"HP:{hpo_clean}", 4841 "label": 
f"HP:{hpo_clean}", 4842 } 4843 } 4844 ) 4845 4846 # Add to dict 4847 param_exomiser_analysis_dict["phenopacket"][ 4848 "phenotypicFeatures" 4849 ] = param_exomiser_phenotypicfeatures 4850 4851 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4852 if not param_exomiser_phenotypicfeatures: 4853 for step in param_exomiser_analysis_dict.get( 4854 "analysis", {} 4855 ).get("steps", []): 4856 if "hiPhivePrioritiser" in step: 4857 param_exomiser_analysis_dict.get("analysis", {}).get( 4858 "steps", [] 4859 ).remove(step) 4860 4861 ### Add Input File ### 4862 4863 # Initial file name and htsFiles 4864 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4865 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4866 { 4867 "uri": tmp_vcf_name, 4868 "htsFormat": "VCF", 4869 "genomeAssembly": assembly, 4870 } 4871 ] 4872 4873 ### Add metaData ### 4874 4875 # If metaData not in analysis dict 4876 if "metaData" not in param_exomiser_analysis_dict: 4877 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4878 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4879 "createdBy": "howard", 4880 "phenopacketSchemaVersion": 1, 4881 } 4882 4883 ### OutputOptions ### 4884 4885 # Init output result folder 4886 output_results = os.path.join(tmp_dir, "results") 4887 4888 # If no outputOptions in analysis dict 4889 if "outputOptions" not in param_exomiser_analysis_dict: 4890 4891 # default output formats 4892 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4893 4894 # Get outputOptions in param 4895 output_options = param_exomiser.get("outputOptions", None) 4896 4897 # If no output_options in param -> check 4898 if not output_options: 4899 output_options = { 4900 "outputContributingVariantsOnly": False, 4901 "numGenes": 0, 4902 "outputFormats": defaut_output_formats, 4903 } 4904 4905 # Replace outputDirectory in output options 4906 output_options["outputDirectory"] = output_results 4907 output_options["outputFileName"] = "howard" 4908 4909 # 
Add outputOptions in analysis dict 4910 param_exomiser_analysis_dict["outputOptions"] = output_options 4911 4912 else: 4913 4914 # Replace output_results and output format (if exists in param) 4915 param_exomiser_analysis_dict["outputOptions"][ 4916 "outputDirectory" 4917 ] = output_results 4918 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4919 list( 4920 set( 4921 param_exomiser_analysis_dict.get( 4922 "outputOptions", {} 4923 ).get("outputFormats", []) 4924 + ["TSV_VARIANT", "VCF"] 4925 ) 4926 ) 4927 ) 4928 4929 # log 4930 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4931 4932 ### ANALYSIS FILE ### 4933 ##################### 4934 4935 ### Full JSON analysis config file ### 4936 4937 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4938 with open(exomiser_analysis, "w") as fp: 4939 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4940 4941 ### SPLIT analysis and sample config files 4942 4943 # Splitted analysis dict 4944 param_exomiser_analysis_dict_for_split = ( 4945 param_exomiser_analysis_dict.copy() 4946 ) 4947 4948 # Phenopacket JSON file 4949 exomiser_analysis_phenopacket = os.path.join( 4950 tmp_dir, "analysis_phenopacket.json" 4951 ) 4952 with open(exomiser_analysis_phenopacket, "w") as fp: 4953 json.dump( 4954 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4955 fp, 4956 indent=4, 4957 ) 4958 4959 # Analysis JSON file without Phenopacket parameters 4960 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4961 exomiser_analysis_analysis = os.path.join( 4962 tmp_dir, "analysis_analysis.json" 4963 ) 4964 with open(exomiser_analysis_analysis, "w") as fp: 4965 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4966 4967 ### INITAL VCF file ### 4968 ####################### 4969 4970 ### Create list of samples to use and include inti initial VCF file #### 4971 4972 # Subject (main sample) 4973 # Get sample ID in analysis dict 4974 sample_subject = ( 4975 
param_exomiser_analysis_dict.get("phenopacket", {}) 4976 .get("subject", {}) 4977 .get("id", None) 4978 ) 4979 sample_proband = ( 4980 param_exomiser_analysis_dict.get("phenopacket", {}) 4981 .get("proband", {}) 4982 .get("subject", {}) 4983 .get("id", None) 4984 ) 4985 sample = [] 4986 if sample_subject: 4987 sample.append(sample_subject) 4988 if sample_proband: 4989 sample.append(sample_proband) 4990 4991 # Get sample ID within Pedigree 4992 pedigree_persons_list = ( 4993 param_exomiser_analysis_dict.get("phenopacket", {}) 4994 .get("pedigree", {}) 4995 .get("persons", {}) 4996 ) 4997 4998 # Create list with all sample ID in pedigree (if exists) 4999 pedigree_persons = [] 5000 for person in pedigree_persons_list: 5001 pedigree_persons.append(person.get("individualId")) 5002 5003 # Concat subject sample ID and samples ID in pedigreesamples 5004 samples = list(set(sample + pedigree_persons)) 5005 5006 # Check if sample list is not empty 5007 if not samples: 5008 log.error(f"No samples found") 5009 raise ValueError(f"No samples found") 5010 5011 # Create VCF with sample (either sample in param or first one by default) 5012 # Export VCF file 5013 self.export_variant_vcf( 5014 vcf_file=tmp_vcf_name, 5015 remove_info=True, 5016 add_samples=True, 5017 list_samples=samples, 5018 index=False, 5019 ) 5020 5021 ### Execute Exomiser ### 5022 ######################## 5023 5024 # Init command 5025 exomiser_command = "" 5026 5027 # Command exomiser options 5028 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5029 5030 # Release 5031 exomiser_release = param_exomiser.get("release", None) 5032 if exomiser_release: 5033 # phenotype data version 5034 exomiser_options += ( 5035 f" --exomiser.phenotype.data-version={exomiser_release} " 5036 ) 5037 # data version 5038 exomiser_options += ( 5039 f" --exomiser.{assembly}.data-version={exomiser_release} " 5040 ) 5041 # variant 
white list 5042 variant_white_list_file = ( 5043 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5044 ) 5045 if os.path.exists( 5046 os.path.join( 5047 databases_folders, assembly, variant_white_list_file 5048 ) 5049 ): 5050 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5051 5052 # transcript_source 5053 transcript_source = param_exomiser.get( 5054 "transcript_source", None 5055 ) # ucsc, refseq, ensembl 5056 if transcript_source: 5057 exomiser_options += ( 5058 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5059 ) 5060 5061 # If analysis contain proband param 5062 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5063 "proband", {} 5064 ): 5065 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5066 5067 # If no proband (usually uniq sample) 5068 else: 5069 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5070 5071 # Log 5072 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5073 5074 # Run command 5075 result = subprocess.call( 5076 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5077 ) 5078 if result: 5079 log.error("Exomiser command failed") 5080 raise ValueError("Exomiser command failed") 5081 5082 ### RESULTS ### 5083 ############### 5084 5085 ### Annotate with TSV fields ### 5086 5087 # Init result tsv file 5088 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5089 5090 # Init result tsv file 5091 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5092 5093 # Parse TSV file and explode columns in INFO field 5094 if exomiser_to_info and os.path.exists(output_results_tsv): 5095 5096 # Log 5097 log.debug("Exomiser columns to VCF INFO field") 5098 5099 # Retrieve columns and types 5100 query = f""" SELECT * FROM read_csv('{output_results_tsv}', 
auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5101 output_results_tsv_df = self.get_query_to_df(query) 5102 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5103 5104 # Init concat fields for update 5105 sql_query_update_concat_fields = [] 5106 5107 # Fields to avoid 5108 fields_to_avoid = [ 5109 "CONTIG", 5110 "START", 5111 "END", 5112 "REF", 5113 "ALT", 5114 "QUAL", 5115 "FILTER", 5116 "GENOTYPE", 5117 ] 5118 5119 # List all columns to add into header 5120 for header_column in output_results_tsv_columns: 5121 5122 # If header column is enable 5123 if header_column not in fields_to_avoid: 5124 5125 # Header info type 5126 header_info_type = "String" 5127 header_column_df = output_results_tsv_df[header_column] 5128 header_column_df_dtype = header_column_df.dtype 5129 if header_column_df_dtype == object: 5130 if ( 5131 pd.to_numeric(header_column_df, errors="coerce") 5132 .notnull() 5133 .all() 5134 ): 5135 header_info_type = "Float" 5136 else: 5137 header_info_type = "Integer" 5138 5139 # Header info 5140 characters_to_validate = ["-"] 5141 pattern = "[" + "".join(characters_to_validate) + "]" 5142 header_info_name = re.sub( 5143 pattern, 5144 "_", 5145 f"Exomiser_{header_column}".replace("#", ""), 5146 ) 5147 header_info_number = "." 
5148 header_info_description = ( 5149 f"Exomiser {header_column} annotation" 5150 ) 5151 header_info_source = "Exomiser" 5152 header_info_version = "unknown" 5153 header_info_code = CODE_TYPE_MAP[header_info_type] 5154 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5155 header_info_name, 5156 header_info_number, 5157 header_info_type, 5158 header_info_description, 5159 header_info_source, 5160 header_info_version, 5161 header_info_code, 5162 ) 5163 5164 # Add field to add for update to concat fields 5165 sql_query_update_concat_fields.append( 5166 f""" 5167 CASE 5168 WHEN table_parquet."{header_column}" NOT IN ('','.') 5169 THEN concat( 5170 '{header_info_name}=', 5171 table_parquet."{header_column}", 5172 ';' 5173 ) 5174 5175 ELSE '' 5176 END 5177 """ 5178 ) 5179 5180 # Update query 5181 sql_query_update = f""" 5182 UPDATE {table_variants} as table_variants 5183 SET INFO = concat( 5184 CASE 5185 WHEN INFO NOT IN ('', '.') 5186 THEN INFO 5187 ELSE '' 5188 END, 5189 CASE 5190 WHEN table_variants.INFO NOT IN ('','.') 5191 THEN ';' 5192 ELSE '' 5193 END, 5194 ( 5195 SELECT 5196 concat( 5197 {",".join(sql_query_update_concat_fields)} 5198 ) 5199 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5200 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5201 AND table_parquet.\"START\" = table_variants.\"POS\" 5202 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5203 AND table_parquet.\"REF\" = table_variants.\"REF\" 5204 ) 5205 ) 5206 ; 5207 """ 5208 5209 # Update 5210 self.conn.execute(sql_query_update) 5211 5212 ### Annotate with VCF INFO field ### 5213 5214 # Init result VCF file 5215 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5216 5217 # If VCF exists 5218 if os.path.exists(output_results_vcf): 5219 5220 # Log 5221 log.debug("Exomiser result VCF update variants") 5222 5223 # Find Exomiser INFO field annotation in header 5224 with 
gzip.open(output_results_vcf, "rt") as f: 5225 header_list = self.read_vcf_header(f) 5226 exomiser_vcf_header = vcf.Reader( 5227 io.StringIO("\n".join(header_list)) 5228 ) 5229 5230 # Add annotation INFO field to header 5231 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5232 5233 # Update variants with VCF 5234 self.update_from_vcf(output_results_vcf) 5235 5236 return True 5237 5238 def annotation_snpeff(self, threads: int = None) -> None: 5239 """ 5240 This function annotate with snpEff 5241 5242 :param threads: The number of threads to use 5243 :return: the value of the variable "return_value". 5244 """ 5245 5246 # DEBUG 5247 log.debug("Start annotation with snpeff databases") 5248 5249 # Threads 5250 if not threads: 5251 threads = self.get_threads() 5252 log.debug("Threads: " + str(threads)) 5253 5254 # DEBUG 5255 delete_tmp = True 5256 if self.get_config().get("verbosity", "warning") in ["debug"]: 5257 delete_tmp = False 5258 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5259 5260 # Config 5261 config = self.get_config() 5262 log.debug("Config: " + str(config)) 5263 5264 # Config - Folders - Databases 5265 databases_folders = ( 5266 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5267 ) 5268 log.debug("Databases annotations: " + str(databases_folders)) 5269 5270 # Config - snpEff bin command 5271 snpeff_bin_command = get_bin_command( 5272 bin="snpEff.jar", 5273 tool="snpeff", 5274 bin_type="jar", 5275 config=config, 5276 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5277 ) 5278 if not snpeff_bin_command: 5279 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5280 log.error(msg_err) 5281 raise ValueError(msg_err) 5282 5283 # Config - snpEff databases 5284 snpeff_databases = ( 5285 config.get("folders", {}) 5286 .get("databases", {}) 5287 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5288 ) 5289 snpeff_databases = full_path(snpeff_databases) 5290 if snpeff_databases is not None and 
snpeff_databases != "": 5291 log.debug(f"Create snpEff databases folder") 5292 if not os.path.exists(snpeff_databases): 5293 os.makedirs(snpeff_databases) 5294 5295 # Param 5296 param = self.get_param() 5297 log.debug("Param: " + str(param)) 5298 5299 # Param 5300 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5301 log.debug("Options: " + str(options)) 5302 5303 # Param - Assembly 5304 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5305 5306 # Param - Options 5307 snpeff_options = ( 5308 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5309 ) 5310 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5311 snpeff_csvstats = ( 5312 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5313 ) 5314 if snpeff_stats: 5315 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5316 snpeff_stats = full_path(snpeff_stats) 5317 snpeff_options += f" -stats {snpeff_stats}" 5318 if snpeff_csvstats: 5319 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5320 snpeff_csvstats = full_path(snpeff_csvstats) 5321 snpeff_options += f" -csvStats {snpeff_csvstats}" 5322 5323 # Data 5324 table_variants = self.get_table_variants() 5325 5326 # Check if not empty 5327 log.debug("Check if not empty") 5328 sql_query_chromosomes = ( 5329 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5330 ) 5331 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5332 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5333 log.info(f"VCF empty") 5334 return 5335 5336 # Export in VCF 5337 log.debug("Create initial file to annotate") 5338 tmp_vcf = NamedTemporaryFile( 5339 prefix=self.get_prefix(), 5340 dir=self.get_tmp_dir(), 5341 suffix=".vcf.gz", 5342 delete=True, 5343 ) 5344 tmp_vcf_name = tmp_vcf.name 5345 5346 # VCF header 5347 vcf_reader = self.get_header() 5348 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5349 5350 # Existing annotations 5351 for vcf_annotation in self.get_header().infos: 5352 5353 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5354 log.debug( 5355 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5356 ) 5357 5358 # Memory limit 5359 # if config.get("memory", None): 5360 # memory_limit = config.get("memory", "8G") 5361 # else: 5362 # memory_limit = "8G" 5363 memory_limit = self.get_memory("8G") 5364 log.debug(f"memory_limit: {memory_limit}") 5365 5366 # snpEff java options 5367 snpeff_java_options = ( 5368 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5369 ) 5370 log.debug(f"Exomiser java options: {snpeff_java_options}") 5371 5372 force_update_annotation = True 5373 5374 if "ANN" not in self.get_header().infos or force_update_annotation: 5375 5376 # Check snpEff database 5377 log.debug(f"Check snpEff databases {[assembly]}") 5378 databases_download_snpeff( 5379 folder=snpeff_databases, assemblies=[assembly], config=config 5380 ) 5381 5382 # Export VCF file 5383 self.export_variant_vcf( 5384 vcf_file=tmp_vcf_name, 5385 remove_info=True, 5386 add_samples=False, 5387 index=True, 5388 ) 5389 5390 # Tmp file 5391 err_files = [] 5392 tmp_annotate_vcf = NamedTemporaryFile( 5393 prefix=self.get_prefix(), 5394 dir=self.get_tmp_dir(), 5395 suffix=".vcf", 5396 delete=False, 5397 ) 5398 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5399 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5400 err_files.append(tmp_annotate_vcf_name_err) 5401 5402 # Command 5403 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5404 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5405 run_parallel_commands([snpeff_command], 1) 5406 5407 # Error messages 5408 log.info(f"Error/Warning messages:") 5409 error_message_command_all = [] 5410 
error_message_command_warning = [] 5411 error_message_command_err = [] 5412 for err_file in err_files: 5413 with open(err_file, "r") as f: 5414 for line in f: 5415 message = line.strip() 5416 error_message_command_all.append(message) 5417 if line.startswith("[W::"): 5418 error_message_command_warning.append(message) 5419 if line.startswith("[E::"): 5420 error_message_command_err.append(f"{err_file}: " + message) 5421 # log info 5422 for message in list( 5423 set(error_message_command_err + error_message_command_warning) 5424 ): 5425 log.info(f" {message}") 5426 # debug info 5427 for message in list(set(error_message_command_all)): 5428 log.debug(f" {message}") 5429 # failed 5430 if len(error_message_command_err): 5431 log.error("Annotation failed: Error in commands") 5432 raise ValueError("Annotation failed: Error in commands") 5433 5434 # Find annotation in header 5435 with open(tmp_annotate_vcf_name, "rt") as f: 5436 header_list = self.read_vcf_header(f) 5437 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5438 5439 for ann in annovar_vcf_header.infos: 5440 if ann not in self.get_header().infos: 5441 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5442 5443 # Update variants 5444 log.info(f"Annotation - Updating...") 5445 self.update_from_vcf(tmp_annotate_vcf_name) 5446 5447 else: 5448 if "ANN" in self.get_header().infos: 5449 log.debug(f"Existing snpEff annotations in VCF") 5450 if force_update_annotation: 5451 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5452 5453 def annotation_annovar(self, threads: int = None) -> None: 5454 """ 5455 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5456 annotations 5457 5458 :param threads: number of threads to use 5459 :return: the value of the variable "return_value". 
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate loaded variants with Annovar.

        Exports the variants table to a temporary VCF, runs ``table_annovar.pl``
        once per configured annotation database, post-processes each output
        through a sed/awk/bcftools pipeline (cleanup of ANNOVAR_DATE, escaped
        semicolons, empty "." fields; field selection and renaming), merges the
        per-database results with ``bcftools merge``, updates the variants table
        from the merged VCF, and removes the temporary files.

        :param threads: number of threads to use; defaults to ``self.get_threads()``
        :raises ValueError: if the annovar/bcftools binaries or the databases
            folder cannot be resolved, or if a command wrote error lines to its
            stderr capture file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files (collected for the final cleanup command)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but not used below —
        # the cleanup section at the end always runs; confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper resolved by helper)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (a list config keeps only its first entry)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl CLI options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (dict: database -> {field: new_name})
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate on an empty variants table)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            # NOTE(review): commands is never appended to or executed below
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (plus post-processing pipe) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized for each database,
                # so only the current run's stderr is scanned below.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" mapping line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: g=gene-based, r=region-based, f=filter-based (default)
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr capture for warning/error lines
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge (original sites VCF + one file per database)
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and merge new INFO
                # fields into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
False) 5899 ) 5900 log.debug(f"force_update_annotation={force_update_annotation}") 5901 force_append_annotation = ( 5902 self.get_param() 5903 .get("annotation", {}) 5904 .get("options", {}) 5905 .get("annotations_append", False) 5906 ) 5907 log.debug(f"force_append_annotation={force_append_annotation}") 5908 5909 # Data 5910 table_variants = self.get_table_variants() 5911 5912 # Check if not empty 5913 log.debug("Check if not empty") 5914 sql_query_chromosomes_df = self.get_query_to_df( 5915 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5916 ) 5917 if not sql_query_chromosomes_df["count"][0]: 5918 log.info(f"VCF empty") 5919 return 5920 5921 # VCF header 5922 vcf_reader = self.get_header() 5923 log.debug("Initial header: " + str(vcf_reader.infos)) 5924 5925 # Nb Variants POS 5926 log.debug("NB Variants Start") 5927 nb_variants = self.conn.execute( 5928 f"SELECT count(*) AS count FROM variants" 5929 ).fetchdf()["count"][0] 5930 log.debug("NB Variants Stop") 5931 5932 # Existing annotations 5933 for vcf_annotation in self.get_header().infos: 5934 5935 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5936 log.debug( 5937 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5938 ) 5939 5940 # Added columns 5941 added_columns = [] 5942 5943 # drop indexes 5944 log.debug(f"Drop indexes...") 5945 self.drop_indexes() 5946 5947 if annotations: 5948 5949 if "ALL" in annotations: 5950 5951 all_param = annotations.get("ALL", {}) 5952 all_param_formats = all_param.get("formats", None) 5953 all_param_releases = all_param.get("releases", None) 5954 5955 databases_infos_dict = self.scan_databases( 5956 database_formats=all_param_formats, 5957 database_releases=all_param_releases, 5958 ) 5959 for database_infos in databases_infos_dict.keys(): 5960 if database_infos not in annotations: 5961 annotations[database_infos] = {"INFO": None} 5962 5963 for annotation in annotations: 5964 5965 if annotation in ["ALL"]: 
5966 continue 5967 5968 # Annotation Name 5969 annotation_name = os.path.basename(annotation) 5970 5971 # Annotation fields 5972 annotation_fields = annotations[annotation] 5973 if not annotation_fields: 5974 annotation_fields = {"INFO": None} 5975 5976 log.debug(f"Annotation '{annotation_name}'") 5977 log.debug( 5978 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5979 ) 5980 5981 # Create Database 5982 database = Database( 5983 database=annotation, 5984 databases_folders=databases_folders, 5985 assembly=assembly, 5986 ) 5987 5988 # Find files 5989 parquet_file = database.get_database() 5990 parquet_hdr_file = database.get_header_file() 5991 parquet_type = database.get_type() 5992 5993 # Check if files exists 5994 if not parquet_file or not parquet_hdr_file: 5995 msg_err_list = [] 5996 if not parquet_file: 5997 msg_err_list.append( 5998 f"Annotation failed: Annotation file not found" 5999 ) 6000 if parquet_file and not parquet_hdr_file: 6001 msg_err_list.append( 6002 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6003 ) 6004 6005 log.error(". ".join(msg_err_list)) 6006 raise ValueError(". 
".join(msg_err_list)) 6007 else: 6008 # Get parquet connexion 6009 parquet_sql_attach = database.get_sql_database_attach( 6010 output="query" 6011 ) 6012 if parquet_sql_attach: 6013 self.conn.execute(parquet_sql_attach) 6014 parquet_file_link = database.get_sql_database_link() 6015 # Log 6016 log.debug( 6017 f"Annotation '{annotation_name}' - file: " 6018 + str(parquet_file) 6019 + " and " 6020 + str(parquet_hdr_file) 6021 ) 6022 6023 # Database full header columns 6024 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6025 parquet_hdr_file 6026 ) 6027 # Log 6028 log.debug( 6029 "Annotation database header columns : " 6030 + str(parquet_hdr_vcf_header_columns) 6031 ) 6032 6033 # Load header as VCF object 6034 parquet_hdr_vcf_header_infos = database.get_header().infos 6035 # Log 6036 log.debug( 6037 "Annotation database header: " 6038 + str(parquet_hdr_vcf_header_infos) 6039 ) 6040 6041 # Get extra infos 6042 parquet_columns = database.get_extra_columns() 6043 # Log 6044 log.debug("Annotation database Columns: " + str(parquet_columns)) 6045 6046 # Add extra columns if "ALL" in annotation_fields 6047 # if "ALL" in annotation_fields: 6048 # allow_add_extra_column = True 6049 if "ALL" in annotation_fields and database.get_extra_columns(): 6050 for extra_column in database.get_extra_columns(): 6051 if ( 6052 extra_column not in annotation_fields 6053 and extra_column.replace("INFO/", "") 6054 not in parquet_hdr_vcf_header_infos 6055 ): 6056 parquet_hdr_vcf_header_infos[extra_column] = ( 6057 vcf.parser._Info( 6058 extra_column, 6059 ".", 6060 "String", 6061 f"{extra_column} description", 6062 "unknown", 6063 "unknown", 6064 self.code_type_map["String"], 6065 ) 6066 ) 6067 6068 # For all fields in database 6069 annotation_fields_all = False 6070 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6071 annotation_fields_all = True 6072 annotation_fields = { 6073 key: key for key in parquet_hdr_vcf_header_infos 6074 } 6075 6076 log.debug( 6077 
"Annotation database header - All annotations added: " 6078 + str(annotation_fields) 6079 ) 6080 6081 # Init 6082 6083 # List of annotation fields to use 6084 sql_query_annotation_update_info_sets = [] 6085 6086 # List of annotation to agregate 6087 sql_query_annotation_to_agregate = [] 6088 6089 # Number of fields 6090 nb_annotation_field = 0 6091 6092 # Annotation fields processed 6093 annotation_fields_processed = [] 6094 6095 # Columns mapping 6096 map_columns = database.map_columns( 6097 columns=annotation_fields, prefixes=["INFO/"] 6098 ) 6099 6100 # Query dict for fields to remove (update option) 6101 query_dict_remove = {} 6102 6103 # Fetch Anotation fields 6104 for annotation_field in annotation_fields: 6105 6106 # annotation_field_column 6107 annotation_field_column = map_columns.get( 6108 annotation_field, "INFO" 6109 ) 6110 6111 # field new name, if parametered 6112 annotation_fields_new_name = annotation_fields.get( 6113 annotation_field, annotation_field 6114 ) 6115 if not annotation_fields_new_name: 6116 annotation_fields_new_name = annotation_field 6117 6118 # To annotate 6119 # force_update_annotation = True 6120 # force_append_annotation = True 6121 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6122 if annotation_field in parquet_hdr_vcf_header_infos and ( 6123 force_update_annotation 6124 or force_append_annotation 6125 or ( 6126 annotation_fields_new_name 6127 not in self.get_header().infos 6128 ) 6129 ): 6130 6131 # Add field to annotation to process list 6132 annotation_fields_processed.append( 6133 annotation_fields_new_name 6134 ) 6135 6136 # explode infos for the field 6137 annotation_fields_new_name_info_msg = "" 6138 if ( 6139 force_update_annotation 6140 and annotation_fields_new_name 6141 in self.get_header().infos 6142 ): 6143 # Remove field from INFO 6144 query = f""" 6145 UPDATE {table_variants} as table_variants 6146 SET INFO = 
REGEXP_REPLACE( 6147 concat(table_variants.INFO,''), 6148 ';*{annotation_fields_new_name}=[^;]*', 6149 '' 6150 ) 6151 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6152 """ 6153 annotation_fields_new_name_info_msg = " [update]" 6154 query_dict_remove[ 6155 f"remove 'INFO/{annotation_fields_new_name}'" 6156 ] = query 6157 6158 # Sep between fields in INFO 6159 nb_annotation_field += 1 6160 if nb_annotation_field > 1: 6161 annotation_field_sep = ";" 6162 else: 6163 annotation_field_sep = "" 6164 6165 log.info( 6166 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6167 ) 6168 6169 # Add INFO field to header 6170 parquet_hdr_vcf_header_infos_number = ( 6171 parquet_hdr_vcf_header_infos[annotation_field].num 6172 or "." 6173 ) 6174 parquet_hdr_vcf_header_infos_type = ( 6175 parquet_hdr_vcf_header_infos[annotation_field].type 6176 or "String" 6177 ) 6178 parquet_hdr_vcf_header_infos_description = ( 6179 parquet_hdr_vcf_header_infos[annotation_field].desc 6180 or f"{annotation_field} description" 6181 ) 6182 parquet_hdr_vcf_header_infos_source = ( 6183 parquet_hdr_vcf_header_infos[annotation_field].source 6184 or "unknown" 6185 ) 6186 parquet_hdr_vcf_header_infos_version = ( 6187 parquet_hdr_vcf_header_infos[annotation_field].version 6188 or "unknown" 6189 ) 6190 6191 vcf_reader.infos[annotation_fields_new_name] = ( 6192 vcf.parser._Info( 6193 annotation_fields_new_name, 6194 parquet_hdr_vcf_header_infos_number, 6195 parquet_hdr_vcf_header_infos_type, 6196 parquet_hdr_vcf_header_infos_description, 6197 parquet_hdr_vcf_header_infos_source, 6198 parquet_hdr_vcf_header_infos_version, 6199 self.code_type_map[ 6200 parquet_hdr_vcf_header_infos_type 6201 ], 6202 ) 6203 ) 6204 6205 # Append 6206 if force_append_annotation: 6207 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6208 else: 6209 query_case_when_append = "" 6210 6211 # Annotation/Update query fields 6212 # Found in INFO column 6213 if ( 6214 annotation_field_column == "INFO" 6215 and "INFO" in parquet_hdr_vcf_header_columns 6216 ): 6217 sql_query_annotation_update_info_sets.append( 6218 f""" 6219 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6220 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6221 ELSE '' 6222 END 6223 """ 6224 ) 6225 # Found in a specific column 6226 else: 6227 sql_query_annotation_update_info_sets.append( 6228 f""" 6229 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6230 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6231 ELSE '' 6232 END 6233 """ 6234 ) 6235 sql_query_annotation_to_agregate.append( 6236 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6237 ) 6238 6239 # Not to annotate 6240 else: 6241 6242 if force_update_annotation: 6243 annotation_message = "forced" 6244 else: 6245 annotation_message = "skipped" 6246 6247 if annotation_field not in parquet_hdr_vcf_header_infos: 6248 log.warning( 6249 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6250 ) 6251 if annotation_fields_new_name in self.get_header().infos: 6252 log.warning( 6253 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6254 ) 6255 6256 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6257 # allow_annotation_full_info = True 6258 allow_annotation_full_info = not force_append_annotation 6259 6260 if parquet_type in ["regions"]: 6261 allow_annotation_full_info = False 6262 6263 if ( 6264 allow_annotation_full_info 6265 and nb_annotation_field == len(annotation_fields) 6266 and annotation_fields_all 6267 and ( 6268 "INFO" in parquet_hdr_vcf_header_columns 6269 and "INFO" in database.get_extra_columns() 6270 ) 6271 ): 6272 log.debug("Column INFO annotation enabled") 6273 sql_query_annotation_update_info_sets = [] 6274 sql_query_annotation_update_info_sets.append( 6275 f" table_parquet.INFO " 6276 ) 6277 6278 if sql_query_annotation_update_info_sets: 6279 6280 # Annotate 6281 log.info(f"Annotation '{annotation_name}' - Annotation...") 6282 6283 # Join query annotation update info sets for SQL 6284 sql_query_annotation_update_info_sets_sql = ",".join( 6285 sql_query_annotation_update_info_sets 6286 ) 6287 6288 # Check chromosomes list (and variants infos) 6289 sql_query_chromosomes = f""" 6290 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6291 FROM {table_variants} as table_variants 6292 GROUP BY table_variants."#CHROM" 6293 ORDER BY table_variants."#CHROM" 6294 """ 6295 sql_query_chromosomes_df = self.conn.execute( 6296 sql_query_chromosomes 6297 ).df() 6298 sql_query_chromosomes_dict = { 6299 entry["CHROM"]: { 6300 "count": entry["count_variants"], 6301 "min": entry["min_variants"], 6302 "max": entry["max_variants"], 6303 } 6304 for index, entry in sql_query_chromosomes_df.iterrows() 6305 } 6306 6307 # Init 6308 nb_of_query = 0 6309 nb_of_variant_annotated = 0 6310 query_dict = query_dict_remove 6311 6312 # for chrom in sql_query_chromosomes_df["CHROM"]: 6313 for chrom in sql_query_chromosomes_dict: 6314 6315 # Number of variant by chromosome 6316 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6317 chrom, {} 6318 ).get("count", 0) 6319 6320 
log.debug( 6321 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6322 ) 6323 6324 # Annotation with regions database 6325 if parquet_type in ["regions"]: 6326 sql_query_annotation_from_clause = f""" 6327 FROM ( 6328 SELECT 6329 '{chrom}' AS \"#CHROM\", 6330 table_variants_from.\"POS\" AS \"POS\", 6331 {",".join(sql_query_annotation_to_agregate)} 6332 FROM {table_variants} as table_variants_from 6333 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6334 table_parquet_from."#CHROM" = '{chrom}' 6335 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6336 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6337 ) 6338 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6339 GROUP BY table_variants_from.\"POS\" 6340 ) 6341 as table_parquet 6342 """ 6343 6344 sql_query_annotation_where_clause = """ 6345 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6346 AND table_parquet.\"POS\" = table_variants.\"POS\" 6347 """ 6348 6349 # Annotation with variants database 6350 else: 6351 sql_query_annotation_from_clause = f""" 6352 FROM {parquet_file_link} as table_parquet 6353 """ 6354 sql_query_annotation_where_clause = f""" 6355 table_variants."#CHROM" = '{chrom}' 6356 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6357 AND table_parquet.\"POS\" = table_variants.\"POS\" 6358 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6359 AND table_parquet.\"REF\" = table_variants.\"REF\" 6360 """ 6361 6362 # Create update query 6363 sql_query_annotation_chrom_interval_pos = f""" 6364 UPDATE {table_variants} as table_variants 6365 SET INFO = 6366 concat( 6367 CASE WHEN table_variants.INFO NOT IN ('','.') 6368 THEN table_variants.INFO 6369 ELSE '' 6370 END 6371 , 6372 CASE WHEN table_variants.INFO NOT IN ('','.') 6373 AND ( 6374 concat({sql_query_annotation_update_info_sets_sql}) 6375 ) 6376 NOT IN ('','.') 6377 THEN ';' 6378 ELSE '' 6379 END 6380 , 6381 
{sql_query_annotation_update_info_sets_sql} 6382 ) 6383 {sql_query_annotation_from_clause} 6384 WHERE {sql_query_annotation_where_clause} 6385 ; 6386 """ 6387 6388 # Add update query to dict 6389 query_dict[ 6390 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6391 ] = sql_query_annotation_chrom_interval_pos 6392 6393 nb_of_query = len(query_dict) 6394 num_query = 0 6395 6396 # SET max_expression_depth TO x 6397 self.conn.execute("SET max_expression_depth TO 10000") 6398 6399 for query_name in query_dict: 6400 query = query_dict[query_name] 6401 num_query += 1 6402 log.info( 6403 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6404 ) 6405 result = self.conn.execute(query) 6406 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6407 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6408 log.info( 6409 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6410 ) 6411 6412 log.info( 6413 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6414 ) 6415 6416 else: 6417 6418 log.info( 6419 f"Annotation '{annotation_name}' - No Annotations available" 6420 ) 6421 6422 log.debug("Final header: " + str(vcf_reader.infos)) 6423 6424 # Remove added columns 6425 for added_column in added_columns: 6426 self.drop_column(column=added_column) 6427 6428 def annotation_splice(self, threads: int = None) -> None: 6429 """ 6430 This function annotate with snpEff 6431 6432 :param threads: The number of threads to use 6433 :return: the value of the variable "return_value". 
6434 """ 6435 6436 # DEBUG 6437 log.debug("Start annotation with splice tools") 6438 6439 # Threads 6440 if not threads: 6441 threads = self.get_threads() 6442 log.debug("Threads: " + str(threads)) 6443 6444 # DEBUG 6445 delete_tmp = True 6446 if self.get_config().get("verbosity", "warning") in ["debug"]: 6447 delete_tmp = False 6448 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6449 6450 # Config 6451 config = self.get_config() 6452 log.debug("Config: " + str(config)) 6453 splice_config = config.get("tools", {}).get("splice", {}) 6454 if not splice_config: 6455 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6456 msg_err = "No Splice tool config" 6457 raise ValueError(msg_err) 6458 log.debug(f"splice_config: {splice_config}") 6459 6460 # Config - Folders - Databases 6461 databases_folders = ( 6462 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6463 ) 6464 log.debug("Databases annotations: " + str(databases_folders)) 6465 6466 # Splice docker image 6467 splice_docker_image = splice_config.get("docker").get("image") 6468 6469 # Pull splice image if it's not already there 6470 if not check_docker_image_exists(splice_docker_image): 6471 log.warning( 6472 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6473 ) 6474 try: 6475 command(f"docker pull {splice_config.get('docker').get('image')}") 6476 except subprocess.CalledProcessError: 6477 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6478 log.error(msg_err) 6479 raise ValueError(msg_err) 6480 6481 # Config - splice databases 6482 splice_databases = ( 6483 config.get("folders", {}) 6484 .get("databases", {}) 6485 .get("splice", DEFAULT_SPLICE_FOLDER) 6486 ) 6487 splice_databases = full_path(splice_databases) 6488 6489 # Param 6490 param = self.get_param() 6491 log.debug("Param: " + str(param)) 6492 6493 # Param 6494 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6495 
log.debug("Options: " + str(options)) 6496 6497 # Data 6498 table_variants = self.get_table_variants() 6499 6500 # Check if not empty 6501 log.debug("Check if not empty") 6502 sql_query_chromosomes = ( 6503 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6504 ) 6505 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6506 log.info("VCF empty") 6507 return None 6508 6509 # Export in VCF 6510 log.debug("Create initial file to annotate") 6511 6512 # Create output folder / work folder 6513 if options.get("output_folder", ""): 6514 output_folder = options.get("output_folder", "") 6515 if not os.path.exists(output_folder): 6516 Path(output_folder).mkdir(parents=True, exist_ok=True) 6517 else: 6518 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6519 if not os.path.exists(output_folder): 6520 Path(output_folder).mkdir(parents=True, exist_ok=True) 6521 6522 if options.get("workdir", ""): 6523 workdir = options.get("workdir", "") 6524 else: 6525 workdir = "/work" 6526 6527 # Create tmp VCF file 6528 tmp_vcf = NamedTemporaryFile( 6529 prefix=self.get_prefix(), 6530 dir=output_folder, 6531 suffix=".vcf", 6532 delete=False, 6533 ) 6534 tmp_vcf_name = tmp_vcf.name 6535 6536 # VCF header 6537 header = self.get_header() 6538 6539 # Existing annotations 6540 for vcf_annotation in self.get_header().infos: 6541 6542 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6543 log.debug( 6544 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6545 ) 6546 6547 # Memory limit 6548 if config.get("memory", None): 6549 memory_limit = config.get("memory", "8G").upper() 6550 # upper() 6551 else: 6552 memory_limit = "8G" 6553 log.debug(f"memory_limit: {memory_limit}") 6554 6555 # Check number of variants to annotate 6556 where_clause_regex_spliceai = r"SpliceAI_\w+" 6557 where_clause_regex_spip = r"SPiP_\w+" 6558 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6559 df_list_of_variants_to_annotate = self.get_query_to_df( 6560 query=f""" SELECT * FROM variants {where_clause} """ 6561 ) 6562 if len(df_list_of_variants_to_annotate) == 0: 6563 log.warning( 6564 f"No variants to annotate with splice. Variants probably already annotated with splice" 6565 ) 6566 return None 6567 else: 6568 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6569 6570 # Export VCF file 6571 self.export_variant_vcf( 6572 vcf_file=tmp_vcf_name, 6573 remove_info=True, 6574 add_samples=True, 6575 index=False, 6576 where_clause=where_clause, 6577 ) 6578 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6579 if any(value for value in splice_config.values() if value is None): 6580 log.warning("At least one splice config parameter is empty") 6581 # exit annotation_splice 6582 return None 6583 6584 # Params in splice nf 6585 def check_values(dico: dict): 6586 """ 6587 Ensure parameters for NF splice pipeline 6588 """ 6589 for key, val in dico.items(): 6590 if key == "genome": 6591 if any( 6592 assemb in options.get("genome", {}) 6593 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6594 ): 6595 yield f"--{key} hg19" 6596 elif any( 6597 assemb in options.get("genome", {}) 6598 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6599 ): 6600 yield f"--{key} hg38" 6601 elif ( 6602 (isinstance(val, str) and val) 6603 or isinstance(val, int) 6604 or isinstance(val, bool) 6605 ): 6606 yield f"--{key} {val}" 6607 6608 # Genome 6609 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6610 options["genome"] = genome 6611 # NF params 6612 nf_params = [] 6613 # Add options 6614 if options: 6615 log.debug(options) 6616 nf_params = list(check_values(options)) 6617 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6618 else: 6619 log.debug("No NF params provided") 6620 # Add threads 6621 if "threads" not in 
options.keys(): 6622 nf_params.append(f"--threads {threads}") 6623 # Genome path 6624 genome_path = find_genome( 6625 config.get("folders", {}) 6626 .get("databases", {}) 6627 .get("genomes", DEFAULT_GENOME_FOLDER), 6628 file=f"{genome}.fa", 6629 ) 6630 # Add genome path 6631 if not genome_path: 6632 raise ValueError( 6633 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6634 ) 6635 else: 6636 log.debug(f"Genome: {genome_path}") 6637 nf_params.append(f"--genome_path {genome_path}") 6638 6639 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6640 """ 6641 Setting up updated databases for SPiP and SpliceAI 6642 """ 6643 6644 try: 6645 6646 # SpliceAI assembly transcriptome 6647 spliceai_assembly = os.path.join( 6648 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6649 options.get("genome"), 6650 "transcriptome", 6651 ) 6652 spip_assembly = options.get("genome") 6653 6654 spip = find( 6655 f"transcriptome_{spip_assembly}.RData", 6656 config.get("folders", {}).get("databases", {}).get("spip", {}), 6657 ) 6658 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6659 log.debug(f"SPiP annotations: {spip}") 6660 log.debug(f"SpliceAI annotations: {spliceai}") 6661 if spip and spliceai: 6662 return [ 6663 f"--spip_transcriptome {spip}", 6664 f"--spliceai_transcriptome {spliceai}", 6665 ] 6666 else: 6667 log.warning( 6668 "Can't find splice databases in configuration, use annotations file from image" 6669 ) 6670 except TypeError: 6671 log.warning( 6672 "Can't find splice databases in configuration, use annotations file from image" 6673 ) 6674 return [] 6675 6676 # Add options, check if transcriptome option have already beend provided 6677 if ( 6678 "spip_transcriptome" not in nf_params 6679 and "spliceai_transcriptome" not in nf_params 6680 ): 6681 splice_reference = splice_annotations(options, config) 6682 if splice_reference: 6683 
nf_params.extend(splice_reference) 6684 # nf_params.append(f"--output_folder {output_folder}") 6685 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6686 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6687 log.debug(cmd) 6688 splice_config["docker"]["command"] = cmd 6689 6690 # Ensure proxy is set 6691 proxy = [ 6692 f"-e {var}={os.getenv(var)}" 6693 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6694 if os.getenv(var) is not None 6695 ] 6696 docker_cmd = get_bin_command( 6697 tool="splice", 6698 bin_type="docker", 6699 config=config, 6700 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6701 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6702 ) 6703 # print(docker_cmd) 6704 # exit() 6705 # Docker debug 6706 # if splice_config.get("rm_container"): 6707 # rm_container = "--rm" 6708 # else: 6709 # rm_container = "" 6710 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6711 log.debug(docker_cmd) 6712 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6713 log.debug(res.stdout) 6714 if res.stderr: 6715 log.error(res.stderr) 6716 res.check_returncode() 6717 # Update variants 6718 log.info("Annotation - Updating...") 6719 # Test find output vcf 6720 log.debug( 6721 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6722 ) 6723 output_vcf = [] 6724 # Wrong folder to look in 6725 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6726 if ( 6727 files 6728 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6729 ): 6730 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6731 # log.debug(os.listdir(options.get("output_folder"))) 6732 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6733 if not output_vcf: 6734 log.debug( 6735 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6736 ) 6737 else: 6738 # Get new header from annotated vcf 6739 log.debug(f"Initial header: {len(header.infos)} fields") 6740 # Create new header with splice infos 6741 new_vcf = Variants(input=output_vcf[0]) 6742 new_vcf_header = new_vcf.get_header().infos 6743 for keys, infos in new_vcf_header.items(): 6744 if keys not in header.infos.keys(): 6745 header.infos[keys] = infos 6746 log.debug(f"New header: {len(header.infos)} fields") 6747 log.debug(f"Splice tmp output: {output_vcf[0]}") 6748 self.update_from_vcf(output_vcf[0]) 6749 6750 # Remove file 6751 remove_if_exists(output_vcf) 6752 6753 ### 6754 # Prioritization 6755 ### 6756 6757 def get_config_default(self, name: str) -> dict: 6758 """ 6759 The function `get_config_default` returns a dictionary containing default configurations for 6760 various calculations and prioritizations. 6761 6762 :param name: The `get_config_default` function returns a dictionary containing default 6763 configurations for different calculations and prioritizations. The `name` parameter is used to 6764 specify which specific configuration to retrieve from the dictionary 6765 :type name: str 6766 :return: The function `get_config_default` returns a dictionary containing default configuration 6767 settings for different calculations and prioritizations. The specific configuration settings are 6768 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6769 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6770 returned. If there is no match, an empty dictionary is returned. 
    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default configurations for
        various calculations and prioritizations.

        :param name: The `name` parameter is used to specify which specific configuration family to
        retrieve from the built-in defaults (e.g. "calculations" or "prioritizations")
        :type name: str
        :return: The default configuration dictionary for the requested `name`. If the `name`
        parameter does not match a key in the built-in defaults, `None` is returned.
        """

        # Built-in default configurations, keyed by configuration family.
        # Each "calculations" entry is either an SQL operation ("type": "sql",
        # with an "operation_query" applied to the variants table) or a Python
        # operation ("type": "python", dispatched through "function_name" with
        # positional "function_params").
        config_default = {
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "table": "variants",
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of 'snpeff_ann_explode' and
                # 'snpeff_ann_explode_uniquify' look swapped (the non-uniquify
                # entry mentions "uniquify values") — confirm intended wording
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "RENAME_INFO_FIELDS": {
                    "type": "python",
                    "name": "RENAME_INFO_FIELDS",
                    "description": "Rename or remove INFO/tags",
                    "available": True,
                    "function_name": "calculation_rename_info_fields",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
                "transcripts_export": {
                    "type": "python",
                    "name": "transcripts_export",
                    "description": "Export transcripts table/view as a file (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_export",
                    "function_params": [],
                },
            },
            # Default prioritization profile: scores/flags applied to the ANN2
            # field depending on the snpEff putative impact it contains
            "prioritizations": {
                "default": {
                    "ANN2": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # Returns None (not {}) when the name is unknown
        return config_default.get(name, None)
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        Prioritize variants in the variants table according to configured profiles.

        For each requested profile, applies the profile's criteria (simple
        value comparisons or raw SQL clauses) to compute per-profile PZ fields
        (score, flag, class, comment, infos, tags) in temporary table columns,
        then folds the results back into the VCF INFO column and registers the
        corresponding INFO fields in the VCF header.

        :param table: name of the table to prioritize; defaults to the
            variants table returned by `get_table_variants(clause="update")`
        :param pz_prefix: prefix for generated INFO fields; defaults to the
            "pzprefix" prioritization parameter, or "PZ"
        :param pz_param: prioritization parameters; defaults to
            param["prioritization"]
        :return: True when prioritization ran; False when no profile is defined
        :raises ValueError: if a requested profile is not configured, a
            criterion's annotation field is absent from the data, or a
            criterion is neither 'operation' nor 'sql' mode
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: explicit argument overrides param section
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (defaults merged with optional external file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings accepted for CLI use)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations (param["prioritizations"] shortcut adds profiles)
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First profile becomes the default (un-suffixed PZ fields) if none set
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Columns added to the table for the duration of this call (dropped at end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: base fields plus one suffixed variant per profile
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists (do not recompute fields
        # already present in the input VCF header)
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (templates for VCF header INFO entries)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (un-suffixed fields describe the
            # default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile (suffixed names)
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Add one working column per PZfield, typed by field kind
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    # Flag starts TRUE (PASS) and is AND-ed with each criterion
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        # SQL fragments that serialize PZ columns into INFO
                        sql_set_info = []

                        # PZ fields set

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Score" in list_of_pzfields
                            ):
                                # Default profile also feeds the un-suffixed field
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        '{pz_prefix}Score=',
                                        {pz_prefix}Score{pzfields_sep}{profile}
                                    )
                                    """
                                )

                        # PZFlag (boolean column rendered as PASS/FILTERED)
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Flag" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        '{pz_prefix}Flag=',
                                        CASE
                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                            THEN 'PASS'
                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                            THEN 'FILTERED'
                                        END
                                    )
                                    """
                                )

                        # PZClass (list column rendered as comma-joined distinct values)
                        if (
                            f"{pz_prefix}Class{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Class" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    concat(
                                        '{pz_prefix}Class=',
                                        CASE
                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                            ELSE '.'
                                        END
                                    )
                                    """
                                )

                        # PZComment (only emitted when non-empty)
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Comment" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    CASE
                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                    """
                                )

                        # PZInfos (only emitted when non-empty)
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Infos" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                    CASE
                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                    """
                                )

                        # Merge PZfields into one comma-prefixed concat() argument
                        # list; ';' separates successive INFO entries
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # skip special sections
                            if annotation.startswith("_"):
                                continue

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:

                                # Criterion mode: keys type/value -> 'operation',
                                # keys sql/fields -> 'sql'
                                criterion_mode = None
                                if np.any(
                                    np.isin(list(criterion.keys()), ["type", "value"])
                                ):
                                    criterion_mode = "operation"
                                elif np.any(
                                    np.isin(list(criterion.keys()), ["sql", "fields"])
                                ):
                                    criterion_mode = "sql"
                                log.debug(f"Criterion Mode: {criterion_mode}")

                                # Criterion parameters
                                criterion_type = criterion.get("type", None)
                                criterion_value = criterion.get("value", None)
                                criterion_sql = criterion.get("sql", None)
                                criterion_fields = criterion.get("fields", None)
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_class = criterion.get("class", None)
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Sanitize free text for safe embedding in SQL
                                # literals and in the ';'-separated INFO column
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                # SQL clause may be given as a list of lines
                                if criterion_sql is not None and isinstance(
                                    criterion_sql, list
                                ):
                                    criterion_sql = " ".join(criterion_sql)

                                # Fields and explode
                                if criterion_fields is None:
                                    criterion_fields = [annotation]
                                if not isinstance(criterion_fields, list):
                                    criterion_fields = str(criterion_fields).split(",")

                                # Class
                                if criterion_class is not None and not isinstance(
                                    criterion_class, list
                                ):
                                    criterion_class = str(criterion_class).split(",")

                                for annotation_field in criterion_fields:

                                    # Explode specific annotation into its own column
                                    log.debug(
                                        f"Explode annotation '{annotation_field}'"
                                    )
                                    added_columns += self.explode_infos(
                                        prefix=explode_infos_prefix,
                                        fields=[annotation_field],
                                        table=table_variants,
                                    )
                                    extra_infos = self.get_extra_infos(
                                        table=table_variants
                                    )

                                    # Check if annotation field is present
                                    if (
                                        f"{explode_infos_prefix}{annotation_field}"
                                        not in extra_infos
                                    ):
                                        msq_err = f"Annotation '{annotation_field}' not in data"
                                        log.error(msq_err)
                                        raise ValueError(msq_err)
                                    else:
                                        log.debug(
                                            f"Annotation '{annotation_field}' in data"
                                        )

                                # SET clauses for this criterion
                                sql_set = []
                                sql_set_info = []

                                # PZ fields set

                                # PZScore
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # VaRank prioritization score mode: keep max score
                                    if prioritization_score_mode.upper().strip() in [
                                        "VARANK",
                                        "MAX",
                                        "MAXIMUM",
                                        "TOP",
                                    ]:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
                                        )
                                    # default HOWARD prioritization score mode: sum scores
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )

                                # PZFlag: one FILTERED criterion filters the variant
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )

                                # PZClass: accumulate distinct classes
                                if (
                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                    and criterion_class is not None
                                ):
                                    sql_set.append(
                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                    )

                                # PZComment: append with ', ' separator when non-empty
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                        concat(
                                            {pz_prefix}Comment{pzfields_sep}{profile},
                                            CASE
                                                WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                THEN ', '
                                                ELSE ''
                                            END,
                                            '{criterion_comment}'
                                        )
                                        """
                                    )

                                # PZInfos: append the raw criterion repr
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                        concat(
                                            {pz_prefix}Infos{pzfields_sep}{profile},
                                            '{criterion_infos}'
                                        )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison
                                if sql_set_option:

                                    if criterion_mode in ["operation"]:

                                        # Numeric values use a typed comparison;
                                        # non-numeric fall back to SIMILAR TO
                                        # (bare except: float() failure means
                                        # string matching is wanted)
                                        try:
                                            float(criterion_value)
                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                            """
                                        except:
                                            contains_option = ""
                                            if criterion_type == "contains":
                                                contains_option = ".*"
                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                            """
                                        sql_queries.append(sql_update)

                                    elif criterion_mode in ["sql"]:

                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE {criterion_sql}
                                        """
                                        sql_queries.append(sql_update)

                                    else:
                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                        log.error(msg_err)
                                        raise ValueError(msg_err)

                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags: summary 'field#value' pairs for this profile
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZTags value (SQL expression fragment)
                            pztags_value = ""
                            pztags_sep_default = ","
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                THEN 'PASS'
                                                ELSE 'FILTERED'
                                                END, '"""
                                        elif pzfield in [f"{pz_prefix}Class"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                ELSE '.'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                        # Serialize computed PZ columns into the INFO column
                        log.info(f"""Profile '{profile}' - Update... """)
                        sql_query_update = f"""
                            UPDATE {table_variants}
                            SET INFO =
                                concat(
                                    CASE
                                        WHEN INFO NOT IN ('','.')
                                        THEN concat(INFO, ';')
                                        ELSE ''
                                    END
                                    {sql_set_info_option}
                                )
                        """
                        self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (working PZ columns and exploded annotations)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
""") 7779 sql_query_update = f""" 7780 UPDATE {table_variants} 7781 SET INFO = 7782 concat( 7783 CASE 7784 WHEN INFO NOT IN ('','.') 7785 THEN concat(INFO, ';') 7786 ELSE '' 7787 END 7788 {sql_set_info_option} 7789 ) 7790 """ 7791 self.conn.execute(sql_query_update) 7792 7793 else: 7794 7795 log.warning(f"No profiles in parameters") 7796 7797 # Remove added columns 7798 for added_column in added_columns: 7799 self.drop_column(column=added_column) 7800 7801 # Explode INFOS fields into table fields 7802 if self.get_explode_infos(): 7803 self.explode_infos( 7804 prefix=self.get_explode_infos_prefix(), 7805 fields=self.get_explode_infos_fields(), 7806 force=True, 7807 ) 7808 7809 return True 7810 7811 ### 7812 # HGVS 7813 ### 7814 7815 def annotation_hgvs(self, threads: int = None) -> None: 7816 """ 7817 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7818 coordinates and alleles. 7819 7820 :param threads: The `threads` parameter is an optional integer that specifies the number of 7821 threads to use for parallel processing. If no value is provided, it will default to the number 7822 of threads obtained from the `get_threads()` method 7823 :type threads: int 7824 """ 7825 7826 # Function for each partition of the Dask Dataframe 7827 def partition_function(partition): 7828 """ 7829 The function `partition_function` applies the `annotation_hgvs_partition` function to 7830 each row of a DataFrame called `partition`. 7831 7832 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7833 to be processed 7834 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7835 the "partition" dataframe along the axis 1. 
7836 """ 7837 return partition.apply(annotation_hgvs_partition, axis=1) 7838 7839 def annotation_hgvs_partition(row) -> str: 7840 """ 7841 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7842 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7843 7844 :param row: A dictionary-like object that contains the values for the following keys: 7845 :return: a string that contains the HGVS names associated with the given row of data. 7846 """ 7847 7848 chr = row["CHROM"] 7849 pos = row["POS"] 7850 ref = row["REF"] 7851 alt = row["ALT"] 7852 7853 # Find list of associated transcripts 7854 transcripts_list = list( 7855 polars_conn.execute( 7856 f""" 7857 SELECT transcript 7858 FROM refseq_df 7859 WHERE CHROM='{chr}' 7860 AND POS={pos} 7861 """ 7862 )["transcript"] 7863 ) 7864 7865 # Full HGVS annotation in list 7866 hgvs_full_list = [] 7867 7868 for transcript_name in transcripts_list: 7869 7870 # Transcript 7871 transcript = get_transcript( 7872 transcripts=transcripts, transcript_name=transcript_name 7873 ) 7874 # Exon 7875 if use_exon: 7876 exon = transcript.find_exon_number(pos) 7877 else: 7878 exon = None 7879 # Protein 7880 transcript_protein = None 7881 if use_protein or add_protein or full_format: 7882 transcripts_protein = list( 7883 polars_conn.execute( 7884 f""" 7885 SELECT protein 7886 FROM refseqlink_df 7887 WHERE transcript='{transcript_name}' 7888 LIMIT 1 7889 """ 7890 )["protein"] 7891 ) 7892 if len(transcripts_protein): 7893 transcript_protein = transcripts_protein[0] 7894 7895 # HGVS name 7896 hgvs_name = format_hgvs_name( 7897 chr, 7898 pos, 7899 ref, 7900 alt, 7901 genome=genome, 7902 transcript=transcript, 7903 transcript_protein=transcript_protein, 7904 exon=exon, 7905 use_gene=use_gene, 7906 use_protein=use_protein, 7907 full_format=full_format, 7908 use_version=use_version, 7909 codon_type=codon_type, 7910 ) 7911 hgvs_full_list.append(hgvs_name) 7912 if add_protein and not 
use_protein and not full_format: 7913 hgvs_name = format_hgvs_name( 7914 chr, 7915 pos, 7916 ref, 7917 alt, 7918 genome=genome, 7919 transcript=transcript, 7920 transcript_protein=transcript_protein, 7921 exon=exon, 7922 use_gene=use_gene, 7923 use_protein=True, 7924 full_format=False, 7925 use_version=use_version, 7926 codon_type=codon_type, 7927 ) 7928 hgvs_full_list.append(hgvs_name) 7929 7930 # Create liste of HGVS annotations 7931 hgvs_full = ",".join(hgvs_full_list) 7932 7933 return hgvs_full 7934 7935 # Polars connexion 7936 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7937 7938 # Config 7939 config = self.get_config() 7940 7941 # Databases 7942 # Genome 7943 databases_genomes_folders = ( 7944 config.get("folders", {}) 7945 .get("databases", {}) 7946 .get("genomes", DEFAULT_GENOME_FOLDER) 7947 ) 7948 databases_genome = ( 7949 config.get("folders", {}).get("databases", {}).get("genomes", "") 7950 ) 7951 # refseq database folder 7952 databases_refseq_folders = ( 7953 config.get("folders", {}) 7954 .get("databases", {}) 7955 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7956 ) 7957 # refseq 7958 databases_refseq = config.get("databases", {}).get("refSeq", None) 7959 # refSeqLink 7960 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7961 7962 # Param 7963 param = self.get_param() 7964 7965 # Quick HGVS 7966 if "hgvs_options" in param and param.get("hgvs_options", ""): 7967 log.info(f"Quick HGVS Annotation:") 7968 if not param.get("hgvs", None): 7969 param["hgvs"] = {} 7970 for option in param.get("hgvs_options", "").split(","): 7971 option_var_val = option.split("=") 7972 option_var = option_var_val[0] 7973 if len(option_var_val) > 1: 7974 option_val = option_var_val[1] 7975 else: 7976 option_val = "True" 7977 if option_val.upper() in ["TRUE"]: 7978 option_val = True 7979 elif option_val.upper() in ["FALSE"]: 7980 option_val = False 7981 log.info(f" {option_var}={option_val}") 7982 param["hgvs"][option_var] = option_val 7983 
7984 # Check if HGVS annotation enabled 7985 if "hgvs" in param: 7986 log.info(f"HGVS Annotation... ") 7987 for hgvs_option in param.get("hgvs", {}): 7988 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7989 else: 7990 return 7991 7992 # HGVS Param 7993 param_hgvs = param.get("hgvs", {}) 7994 use_exon = param_hgvs.get("use_exon", False) 7995 use_gene = param_hgvs.get("use_gene", False) 7996 use_protein = param_hgvs.get("use_protein", False) 7997 add_protein = param_hgvs.get("add_protein", False) 7998 full_format = param_hgvs.get("full_format", False) 7999 use_version = param_hgvs.get("use_version", False) 8000 codon_type = param_hgvs.get("codon_type", "3") 8001 8002 # refSseq refSeqLink 8003 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8004 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8005 8006 # Assembly 8007 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8008 8009 # Genome 8010 genome_file = None 8011 if find_genome(databases_genome): 8012 genome_file = find_genome(databases_genome) 8013 else: 8014 genome_file = find_genome( 8015 genome_path=databases_genomes_folders, assembly=assembly 8016 ) 8017 log.debug("Genome: " + str(genome_file)) 8018 8019 # refSseq 8020 refseq_file = find_file_prefix( 8021 input_file=databases_refseq, 8022 prefix="ncbiRefSeq", 8023 folder=databases_refseq_folders, 8024 assembly=assembly, 8025 ) 8026 log.debug("refSeq: " + str(refseq_file)) 8027 8028 # refSeqLink 8029 refseqlink_file = find_file_prefix( 8030 input_file=databases_refseqlink, 8031 prefix="ncbiRefSeqLink", 8032 folder=databases_refseq_folders, 8033 assembly=assembly, 8034 ) 8035 log.debug("refSeqLink: " + str(refseqlink_file)) 8036 8037 # Threads 8038 if not threads: 8039 threads = self.get_threads() 8040 log.debug("Threads: " + str(threads)) 8041 8042 # Variables 8043 table_variants = self.get_table_variants(clause="update") 8044 8045 # Get variants SNV and InDel only 8046 
query_variants = f""" 8047 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8048 FROM {table_variants} 8049 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8050 """ 8051 df_variants = self.get_query_to_df(query_variants) 8052 8053 # Added columns 8054 added_columns = [] 8055 8056 # Add hgvs column in variants table 8057 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8058 added_column = self.add_column( 8059 table_variants, hgvs_column_name, "STRING", default_value=None 8060 ) 8061 added_columns.append(added_column) 8062 8063 log.debug(f"refSeq loading...") 8064 # refSeq in duckDB 8065 refseq_table = get_refseq_table( 8066 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8067 ) 8068 # Loading all refSeq in Dataframe 8069 refseq_query = f""" 8070 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8071 FROM {refseq_table} 8072 JOIN df_variants ON ( 8073 {refseq_table}.chrom = df_variants.CHROM 8074 AND {refseq_table}.txStart<=df_variants.POS 8075 AND {refseq_table}.txEnd>=df_variants.POS 8076 ) 8077 """ 8078 refseq_df = self.conn.query(refseq_query).pl() 8079 8080 if refseqlink_file: 8081 log.debug(f"refSeqLink loading...") 8082 # refSeqLink in duckDB 8083 refseqlink_table = get_refseq_table( 8084 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8085 ) 8086 # Loading all refSeqLink in Dataframe 8087 protacc_column = "protAcc_with_ver" 8088 mrnaacc_column = "mrnaAcc_with_ver" 8089 refseqlink_query = f""" 8090 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8091 FROM {refseqlink_table} 8092 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8093 WHERE protAcc_without_ver IS NOT NULL 8094 """ 8095 # Polars Dataframe 8096 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8097 8098 # Read RefSeq transcripts into a python dict/model. 
8099 log.debug(f"Transcripts loading...") 8100 with tempfile.TemporaryDirectory() as tmpdir: 8101 transcripts_query = f""" 8102 COPY ( 8103 SELECT {refseq_table}.* 8104 FROM {refseq_table} 8105 JOIN df_variants ON ( 8106 {refseq_table}.chrom=df_variants.CHROM 8107 AND {refseq_table}.txStart<=df_variants.POS 8108 AND {refseq_table}.txEnd>=df_variants.POS 8109 ) 8110 ) 8111 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8112 """ 8113 self.conn.query(transcripts_query) 8114 with open(f"{tmpdir}/transcript.tsv") as infile: 8115 transcripts = read_transcripts(infile) 8116 8117 # Polars connexion 8118 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8119 8120 log.debug("Genome loading...") 8121 # Read genome sequence using pyfaidx. 8122 genome = Fasta(genome_file) 8123 8124 log.debug("Start annotation HGVS...") 8125 8126 # Create 8127 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8128 ddf = dd.from_pandas(df_variants, npartitions=threads) 8129 8130 # Use dask.dataframe.apply() to apply function on each partition 8131 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8132 8133 # Convert Dask DataFrame to Pandas Dataframe 8134 df = ddf.compute() 8135 8136 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8137 with tempfile.TemporaryDirectory() as tmpdir: 8138 df_parquet = os.path.join(tmpdir, "df.parquet") 8139 df.to_parquet(df_parquet) 8140 8141 # Update hgvs column 8142 update_variant_query = f""" 8143 UPDATE {table_variants} 8144 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8145 FROM read_parquet('{df_parquet}') as df 8146 WHERE variants."#CHROM" = df.CHROM 8147 AND variants.POS = df.POS 8148 AND variants.REF = df.REF 8149 AND variants.ALT = df.ALT 8150 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8151 """ 8152 self.execute_query(update_variant_query) 8153 8154 # Update INFO column 8155 sql_query_update = f""" 8156 UPDATE {table_variants} 8157 SET INFO = 8158 concat( 8159 CASE 8160 WHEN INFO NOT IN ('','.') 8161 THEN concat(INFO, ';') 8162 ELSE '' 8163 END, 8164 'hgvs=', 8165 {hgvs_column_name} 8166 ) 8167 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8168 """ 8169 self.execute_query(sql_query_update) 8170 8171 # Add header 8172 HGVS_INFOS = { 8173 "hgvs": { 8174 "ID": "hgvs", 8175 "Number": ".", 8176 "Type": "String", 8177 "Description": f"HGVS annotatation with HOWARD", 8178 } 8179 } 8180 8181 for field in HGVS_INFOS: 8182 field_ID = HGVS_INFOS[field]["ID"] 8183 field_description = HGVS_INFOS[field]["Description"] 8184 self.get_header().infos[field_ID] = vcf.parser._Info( 8185 field_ID, 8186 HGVS_INFOS[field]["Number"], 8187 HGVS_INFOS[field]["Type"], 8188 field_description, 8189 "unknown", 8190 "unknown", 8191 code_type_map[HGVS_INFOS[field]["Type"]], 8192 ) 8193 8194 # Remove added columns 8195 for added_column in added_columns: 8196 self.drop_column(column=added_column) 8197 8198 ### 8199 # Calculation 8200 ### 8201 8202 def get_operations_help( 8203 self, operations_config_dict: dict = {}, operations_config_file: str = None 8204 ) -> list: 8205 8206 # Init 8207 operations_help = [] 8208 8209 # operations 8210 operations = self.get_config_json( 8211 name="calculations", 8212 
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants data.

        For each requested operation, checks whether it is a python or a sql
        operation in the calculations configuration and dispatches to
        `calculation_process_function` or `calculation_process_sql`.

        :param operations: dict of operations to run (operation name ->
            options); overridden by param["calculation"]["calculations"] when
            present
        :param operations_config_dict: dict of operation configurations merged
            over the default calculation config
        :param operations_config_file: path to an operations configuration
            file; defaults to param["calculation"]["calculation_config"]
        :raises ValueError: if an operation or its type is not available in
            the operations configuration

        param json example:
            "calculation": {
              "NOMEN": {
                "options": {
                  "hgvs_field": "hgvs"
                },
                "middle" : null
              }
            }
        """

        # Param
        param = self.get_param()

        # Check operations config file (param fallback)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation names are matched case-insensitively)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated param["calculations"] shortcut)
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    # Keep any options already defined for this operation in param
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (after the quick ones)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (last-resort fallback to param section)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        # Dispatch on operation type: python callback or SQL query
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
'{operation_name}' NOT available" 8346 ) 8347 raise ValueError( 8348 f"Operations config: Calculation '{operation_name}' NOT available" 8349 ) 8350 8351 # Explode INFOS fields into table fields 8352 if self.get_explode_infos(): 8353 self.explode_infos( 8354 prefix=self.get_explode_infos_prefix(), 8355 fields=self.get_explode_infos_fields(), 8356 force=True, 8357 ) 8358 8359 def calculation_process_sql( 8360 self, operation: dict, operation_name: str = "unknown" 8361 ) -> None: 8362 """ 8363 The `calculation_process_sql` function takes in a mathematical operation as a string and 8364 performs the operation, updating the specified table with the result. 8365 8366 :param operation: The `operation` parameter is a dictionary that contains information about the 8367 mathematical operation to be performed. It includes the following keys: 8368 :type operation: dict 8369 :param operation_name: The `operation_name` parameter is a string that represents the name of 8370 the mathematical operation being performed. 
It is used for logging and error handling purposes, 8371 defaults to unknown 8372 :type operation_name: str (optional) 8373 """ 8374 8375 # Operation infos 8376 operation_name = operation.get("name", "unknown") 8377 log.debug(f"process SQL {operation_name}") 8378 output_column_name = operation.get("output_column_name", operation_name) 8379 output_column_type = operation.get("output_column_type", "String") 8380 prefix = operation.get("explode_infos_prefix", "") 8381 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8382 output_column_description = operation.get( 8383 "output_column_description", f"{operation_name} operation" 8384 ) 8385 operation_query = operation.get("operation_query", None) 8386 if isinstance(operation_query, list): 8387 operation_query = " ".join(operation_query) 8388 operation_info_fields = operation.get("info_fields", []) 8389 operation_info_fields_check = operation.get("info_fields_check", False) 8390 operation_info = operation.get("operation_info", True) 8391 operation_table = operation.get( 8392 "table", self.get_table_variants(clause="alter") 8393 ) 8394 8395 # table variants 8396 if operation_table: 8397 table_variants = operation_table 8398 else: 8399 table_variants = self.get_table_variants(clause="alter") 8400 8401 if operation_query: 8402 8403 # Info fields check 8404 operation_info_fields_check_result = True 8405 if operation_info_fields_check: 8406 header_infos = self.get_header().infos 8407 for info_field in operation_info_fields: 8408 operation_info_fields_check_result = ( 8409 operation_info_fields_check_result 8410 and info_field in header_infos 8411 ) 8412 8413 # If info fields available 8414 if operation_info_fields_check_result: 8415 8416 # Added_columns 8417 added_columns = [] 8418 8419 # Create VCF header field 8420 vcf_reader = self.get_header() 8421 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8422 output_column_name, 8423 ".", 8424 output_column_type, 8425 
output_column_description, 8426 "howard calculation", 8427 "0", 8428 self.code_type_map.get(output_column_type), 8429 ) 8430 8431 # Explode infos if needed 8432 log.debug(f"calculation_process_sql prefix {prefix}") 8433 added_columns += self.explode_infos( 8434 prefix=prefix, 8435 fields=[output_column_name] + operation_info_fields, 8436 force=False, 8437 table=table_variants, 8438 ) 8439 8440 # Create column 8441 added_column = self.add_column( 8442 table_name=table_variants, 8443 column_name=prefix + output_column_name, 8444 column_type=output_column_type_sql, 8445 default_value="null", 8446 ) 8447 added_columns.append(added_column) 8448 8449 # Operation calculation 8450 try: 8451 8452 # Query to update calculation column 8453 sql_update = f""" 8454 UPDATE {table_variants} 8455 SET "{prefix}{output_column_name}" = ({operation_query}) 8456 """ 8457 self.conn.execute(sql_update) 8458 8459 # Add to INFO 8460 if operation_info: 8461 sql_update_info = f""" 8462 UPDATE {table_variants} 8463 SET "INFO" = 8464 concat( 8465 CASE 8466 WHEN "INFO" IS NOT NULL 8467 THEN concat("INFO", ';') 8468 ELSE '' 8469 END, 8470 '{output_column_name}=', 8471 "{prefix}{output_column_name}" 8472 ) 8473 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8474 """ 8475 self.conn.execute(sql_update_info) 8476 8477 except: 8478 log.error( 8479 f"Operations config: Calculation '{operation_name}' query failed" 8480 ) 8481 raise ValueError( 8482 f"Operations config: Calculation '{operation_name}' query failed" 8483 ) 8484 8485 # Remove added columns 8486 for added_column in added_columns: 8487 log.debug(f"added_column: {added_column}") 8488 self.drop_column(column=added_column) 8489 8490 else: 8491 log.error( 8492 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8493 ) 8494 raise ValueError( 8495 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8496 ) 8497 8498 else: 8499 log.error( 8500 f"Operations config: Calculation '{operation_name}' query NOT defined" 8501 ) 8502 raise ValueError( 8503 f"Operations config: Calculation '{operation_name}' query NOT defined" 8504 ) 8505 8506 def calculation_process_function( 8507 self, operation: dict, operation_name: str = "unknown" 8508 ) -> None: 8509 """ 8510 The `calculation_process_function` takes in an operation dictionary and performs the specified 8511 function with the given parameters. 8512 8513 :param operation: The `operation` parameter is a dictionary that contains information about the 8514 operation to be performed. It has the following keys: 8515 :type operation: dict 8516 :param operation_name: The `operation_name` parameter is a string that represents the name of 8517 the operation being performed. It is used for logging purposes, defaults to unknown 8518 :type operation_name: str (optional) 8519 """ 8520 8521 operation_name = operation["name"] 8522 log.debug(f"process Python {operation_name}") 8523 function_name = operation["function_name"] 8524 function_params = operation["function_params"] 8525 getattr(self, function_name)(*function_params) 8526 8527 def calculation_variant_id(self) -> None: 8528 """ 8529 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8530 updates the INFO field of a variants table with the variant ID. 
8531 """ 8532 8533 # variant_id annotation field 8534 variant_id_tag = self.get_variant_id_column() 8535 added_columns = [variant_id_tag] 8536 8537 # variant_id hgvs tags" 8538 vcf_infos_tags = { 8539 variant_id_tag: "howard variant ID annotation", 8540 } 8541 8542 # Variants table 8543 table_variants = self.get_table_variants() 8544 8545 # Header 8546 vcf_reader = self.get_header() 8547 8548 # Add variant_id to header 8549 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8550 variant_id_tag, 8551 ".", 8552 "String", 8553 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8554 "howard calculation", 8555 "0", 8556 self.code_type_map.get("String"), 8557 ) 8558 8559 # Update 8560 sql_update = f""" 8561 UPDATE {table_variants} 8562 SET "INFO" = 8563 concat( 8564 CASE 8565 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8566 THEN '' 8567 ELSE concat("INFO", ';') 8568 END, 8569 '{variant_id_tag}=', 8570 "{variant_id_tag}" 8571 ) 8572 """ 8573 self.conn.execute(sql_update) 8574 8575 # Remove added columns 8576 for added_column in added_columns: 8577 self.drop_column(column=added_column) 8578 8579 def calculation_extract_snpeff_hgvs( 8580 self, 8581 snpeff_hgvs: str = "snpeff_hgvs", 8582 snpeff_field: str = "ANN", 8583 ) -> None: 8584 """ 8585 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8586 annotation field in a VCF file and adds them as a new column in the variants table. 8587 8588 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8589 function is used to specify the name of the column that will store the HGVS nomenclatures 8590 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8591 snpeff_hgvs 8592 :type snpeff_hgvs: str (optional) 8593 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8594 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8595 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8596 to ANN 8597 :type snpeff_field: str (optional) 8598 """ 8599 8600 # Snpeff hgvs tags 8601 vcf_infos_tags = { 8602 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8603 } 8604 8605 # Prefix 8606 prefix = self.get_explode_infos_prefix() 8607 if prefix: 8608 prefix = "INFO/" 8609 8610 # snpEff fields 8611 speff_ann_infos = prefix + snpeff_field 8612 speff_hgvs_infos = prefix + snpeff_hgvs 8613 8614 # Variants table 8615 table_variants = self.get_table_variants() 8616 8617 # Header 8618 vcf_reader = self.get_header() 8619 8620 # Add columns 8621 added_columns = [] 8622 8623 # Explode HGVS field in column 8624 added_columns += self.explode_infos(fields=[snpeff_field]) 8625 8626 if snpeff_field in vcf_reader.infos: 8627 8628 log.debug(vcf_reader.infos[snpeff_field]) 8629 8630 # Extract ANN header 8631 ann_description = vcf_reader.infos[snpeff_field].desc 8632 pattern = r"'(.+?)'" 8633 match = re.search(pattern, ann_description) 8634 if match: 8635 ann_header_match = match.group(1).split(" | ") 8636 ann_header_desc = {} 8637 for i in range(len(ann_header_match)): 8638 ann_header_info = "".join( 8639 char for char in ann_header_match[i] if char.isalnum() 8640 ) 8641 ann_header_desc[ann_header_info] = ann_header_match[i] 8642 if not ann_header_desc: 8643 raise ValueError("Invalid header description format") 8644 else: 8645 raise ValueError("Invalid header description format") 8646 8647 # Create variant id 8648 variant_id_column = self.get_variant_id_column() 8649 added_columns += [variant_id_column] 8650 8651 # Create dataframe 8652 dataframe_snpeff_hgvs = self.get_query_to_df( 8653 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8654 ) 8655 8656 # Create main NOMEN column 8657 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8658 speff_ann_infos 8659 ].apply( 8660 lambda x: extract_snpeff_hgvs( 
8661 str(x), header=list(ann_header_desc.values()) 8662 ) 8663 ) 8664 8665 # Add snpeff_hgvs to header 8666 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8667 snpeff_hgvs, 8668 ".", 8669 "String", 8670 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8671 "howard calculation", 8672 "0", 8673 self.code_type_map.get("String"), 8674 ) 8675 8676 # Update 8677 sql_update = f""" 8678 UPDATE variants 8679 SET "INFO" = 8680 concat( 8681 CASE 8682 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8683 THEN '' 8684 ELSE concat("INFO", ';') 8685 END, 8686 CASE 8687 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8688 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8689 THEN concat( 8690 '{snpeff_hgvs}=', 8691 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8692 ) 8693 ELSE '' 8694 END 8695 ) 8696 FROM dataframe_snpeff_hgvs 8697 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8698 8699 """ 8700 self.conn.execute(sql_update) 8701 8702 # Delete dataframe 8703 del dataframe_snpeff_hgvs 8704 gc.collect() 8705 8706 else: 8707 8708 log.warning( 8709 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8710 ) 8711 8712 # Remove added columns 8713 for added_column in added_columns: 8714 self.drop_column(column=added_column) 8715 8716 def calculation_snpeff_ann_explode( 8717 self, 8718 uniquify: bool = True, 8719 output_format: str = "fields", 8720 output_prefix: str = "snpeff_", 8721 snpeff_field: str = "ANN", 8722 ) -> None: 8723 """ 8724 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8725 exploding the HGVS field and updating variant information accordingly. 8726 8727 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8728 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8729 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8730 defaults to True 8731 :type uniquify: bool (optional) 8732 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8733 function specifies the format in which the output annotations will be generated. It has a 8734 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8735 format, defaults to fields 8736 :type output_format: str (optional) 8737 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8738 method is used to specify the prefix that will be added to the output annotations generated 8739 during the calculation process. This prefix helps to differentiate the newly added annotations 8740 from existing ones in the output data. By default, the, defaults to ANN_ 8741 :type output_prefix: str (optional) 8742 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8743 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8744 field will be processed to explode the HGVS annotations and update the variant information 8745 accordingly, defaults to ANN 8746 :type snpeff_field: str (optional) 8747 """ 8748 8749 # SnpEff annotation field 8750 snpeff_hgvs = "snpeff_ann_explode" 8751 8752 # Snpeff hgvs tags 8753 vcf_infos_tags = { 8754 snpeff_hgvs: "Explode snpEff annotations", 8755 } 8756 8757 # Prefix 8758 prefix = self.get_explode_infos_prefix() 8759 if prefix: 8760 prefix = "INFO/" 8761 8762 # snpEff fields 8763 speff_ann_infos = prefix + snpeff_field 8764 speff_hgvs_infos = prefix + snpeff_hgvs 8765 8766 # Variants table 8767 table_variants = self.get_table_variants() 8768 8769 # Header 8770 vcf_reader = self.get_header() 8771 8772 # Add columns 8773 added_columns = [] 8774 8775 # Explode HGVS field in column 8776 added_columns += self.explode_infos(fields=[snpeff_field]) 8777 log.debug(f"snpeff_field={snpeff_field}") 8778 log.debug(f"added_columns={added_columns}") 8779 8780 if snpeff_field in vcf_reader.infos: 8781 8782 # Extract ANN header 8783 ann_description = vcf_reader.infos[snpeff_field].desc 8784 pattern = r"'(.+?)'" 8785 match = re.search(pattern, ann_description) 8786 if match: 8787 ann_header_match = match.group(1).split(" | ") 8788 ann_header = [] 8789 ann_header_desc = {} 8790 for i in range(len(ann_header_match)): 8791 ann_header_info = "".join( 8792 char for char in ann_header_match[i] if char.isalnum() 8793 ) 8794 ann_header.append(ann_header_info) 8795 ann_header_desc[ann_header_info] = ann_header_match[i] 8796 if not ann_header_desc: 8797 raise ValueError("Invalid header description format") 8798 else: 8799 raise ValueError("Invalid header description format") 8800 8801 # Create variant id 8802 variant_id_column = self.get_variant_id_column() 8803 added_columns += [variant_id_column] 8804 8805 # Create dataframe 8806 dataframe_snpeff_hgvs = self.get_query_to_df( 8807 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8808 ) 8809 
8810 # Create snpEff columns 8811 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8812 speff_ann_infos 8813 ].apply( 8814 lambda x: explode_snpeff_ann( 8815 str(x), 8816 uniquify=uniquify, 8817 output_format=output_format, 8818 prefix=output_prefix, 8819 header=list(ann_header_desc.values()), 8820 ) 8821 ) 8822 8823 # Header 8824 ann_annotations_prefix = "" 8825 if output_format.upper() in ["JSON"]: 8826 ann_annotations_prefix = f"{output_prefix}=" 8827 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8828 output_prefix, 8829 ".", 8830 "String", 8831 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8832 + " - JSON format", 8833 "howard calculation", 8834 "0", 8835 self.code_type_map.get("String"), 8836 ) 8837 else: 8838 for ann_annotation in ann_header: 8839 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8840 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8841 ann_annotation_id, 8842 ".", 8843 "String", 8844 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8845 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8846 "howard calculation", 8847 "0", 8848 self.code_type_map.get("String"), 8849 ) 8850 8851 # Update 8852 sql_update = f""" 8853 UPDATE variants 8854 SET "INFO" = 8855 concat( 8856 CASE 8857 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8858 THEN '' 8859 ELSE concat("INFO", ';') 8860 END, 8861 CASE 8862 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8863 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8864 THEN concat( 8865 '{ann_annotations_prefix}', 8866 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8867 ) 8868 ELSE '' 8869 END 8870 ) 8871 FROM dataframe_snpeff_hgvs 8872 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8873 8874 """ 8875 self.conn.execute(sql_update) 8876 8877 # Delete dataframe 8878 del dataframe_snpeff_hgvs 8879 gc.collect() 8880 8881 else: 8882 8883 log.warning( 8884 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8885 ) 8886 8887 # Remove added columns 8888 for added_column in added_columns: 8889 self.drop_column(column=added_column) 8890 8891 def calculation_extract_nomen(self) -> None: 8892 """ 8893 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8894 """ 8895 8896 # NOMEN field 8897 field_nomen_dict = "NOMEN_DICT" 8898 8899 # NOMEN structure 8900 nomen_dict = { 8901 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8902 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8903 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8904 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8905 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8906 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8907 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8908 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. 
for CNOMEN and PNOMEN)", 8909 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8910 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8911 } 8912 8913 # Param 8914 param = self.get_param() 8915 8916 # Threads 8917 threads = self.get_threads() 8918 8919 # Prefix 8920 prefix = self.get_explode_infos_prefix() 8921 8922 # Header 8923 vcf_reader = self.get_header() 8924 8925 # Added columns 8926 added_columns = [] 8927 8928 # Get HGVS field 8929 hgvs_field = ( 8930 param.get("calculation", {}) 8931 .get("calculations", {}) 8932 .get("NOMEN", {}) 8933 .get("options", {}) 8934 .get("hgvs_field", "hgvs") 8935 ) 8936 8937 # Get NOMEN pattern 8938 nomen_pattern = ( 8939 param.get("calculation", {}) 8940 .get("calculations", {}) 8941 .get("NOMEN", {}) 8942 .get("options", {}) 8943 .get("pattern", None) 8944 ) 8945 8946 # transcripts list of preference sources 8947 transcripts_sources = {} 8948 8949 # Get transcripts 8950 transcripts_file = ( 8951 param.get("calculation", {}) 8952 .get("calculations", {}) 8953 .get("NOMEN", {}) 8954 .get("options", {}) 8955 .get("transcripts", None) 8956 ) 8957 transcripts_file = full_path(transcripts_file) 8958 if transcripts_file: 8959 if os.path.exists(transcripts_file): 8960 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8961 transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist() 8962 transcripts_sources["file"] = transcripts_from_file 8963 else: 8964 msg_err = f"Transcript file '{transcripts_file}' does NOT exist" 8965 log.error(msg_err) 8966 raise ValueError(msg_err) 8967 8968 # Get transcripts table 8969 transcripts_table = ( 8970 param.get("calculation", {}) 8971 .get("calculations", {}) 8972 .get("NOMEN", {}) 8973 .get("options", {}) 8974 .get("transcripts_table", self.get_table_variants()) 8975 ) 8976 # Get transcripts column 8977 transcripts_column = ( 8978 param.get("calculation", {}) 8979 .get("calculations", {}) 8980 .get("NOMEN", {}) 8981 
.get("options", {}) 8982 .get("transcripts_column", None) 8983 ) 8984 8985 if transcripts_table and transcripts_column: 8986 extra_field_transcript = f"{transcripts_table}.{transcripts_column}" 8987 # Explode if not exists 8988 added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table) 8989 else: 8990 extra_field_transcript = f"NULL" 8991 8992 # Transcripts of preference source order 8993 transcripts_order = ( 8994 param.get("calculation", {}) 8995 .get("calculations", {}) 8996 .get("NOMEN", {}) 8997 .get("options", {}) 8998 .get("transcripts_order", ["column", "file"]) 8999 ) 9000 9001 # Transcripts from file 9002 transcripts = transcripts_sources.get("file", []) 9003 9004 # Explode HGVS field in column 9005 added_columns += self.explode_infos(fields=[hgvs_field]) 9006 9007 # extra infos 9008 extra_infos = self.get_extra_infos() 9009 extra_field = prefix + hgvs_field 9010 9011 if extra_field in extra_infos: 9012 9013 # Create dataframe 9014 dataframe_hgvs = self.get_query_to_df( 9015 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """ 9016 ) 9017 9018 # Transcripts rank 9019 transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)} 9020 transcripts_len = len(transcripts_rank) 9021 9022 # Create main NOMEN column 9023 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply( 9024 lambda x: find_nomen( 9025 hgvs=x.hgvs, 9026 transcript=x.transcript, 9027 transcripts=transcripts_rank, 9028 pattern=nomen_pattern, 9029 transcripts_source_order=transcripts_order, 9030 transcripts_len=transcripts_len 9031 ), 9032 axis=1, 9033 ) 9034 9035 # Explode NOMEN Structure and create SQL set for update 9036 sql_nomen_fields = [] 9037 for nomen_field in nomen_dict: 9038 9039 # Create VCF header field 9040 vcf_reader.infos[nomen_field] = vcf.parser._Info( 9041 nomen_field, 9042 ".", 9043 "String", 9044 nomen_dict.get(nomen_field, "howard 
calculation NOMEN"), 9045 "howard calculation", 9046 "0", 9047 self.code_type_map.get("String"), 9048 ) 9049 9050 # Add field to SQL query update 9051 sql_nomen_fields.append( 9052 f""" 9053 CASE 9054 WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('') 9055 THEN concat( 9056 ';{nomen_field}=', 9057 dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" 9058 ) 9059 ELSE '' 9060 END 9061 """ 9062 ) 9063 9064 # SQL set for update 9065 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 9066 9067 # Update 9068 sql_update = f""" 9069 UPDATE variants 9070 SET "INFO" = 9071 concat( 9072 CASE 9073 WHEN "INFO" IS NULL 9074 THEN '' 9075 ELSE "INFO" 9076 END, 9077 {sql_nomen_fields_set} 9078 ) 9079 FROM dataframe_hgvs 9080 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 9081 AND variants."POS" = dataframe_hgvs."POS" 9082 AND variants."REF" = dataframe_hgvs."REF" 9083 AND variants."ALT" = dataframe_hgvs."ALT" 9084 """ 9085 self.conn.execute(sql_update) 9086 9087 # Delete dataframe 9088 del dataframe_hgvs 9089 gc.collect() 9090 9091 # Remove added columns 9092 for added_column in added_columns: 9093 self.drop_column(column=added_column) 9094 9095 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9096 """ 9097 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9098 pipeline/sample for a variant and updates the variant information in a VCF file. 9099 9100 :param tag: The `tag` parameter is a string that represents the annotation field for the 9101 "findbypipeline" information in the VCF file. 
It is used to create the annotation field in the 9102 VCF header and to update the corresponding field in the variants table, defaults to 9103 findbypipeline 9104 :type tag: str (optional) 9105 """ 9106 9107 # if FORMAT and samples 9108 if ( 9109 "FORMAT" in self.get_header_columns_as_list() 9110 and self.get_header_sample_list() 9111 ): 9112 9113 # findbypipeline annotation field 9114 findbypipeline_tag = tag 9115 9116 # VCF infos tags 9117 vcf_infos_tags = { 9118 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9119 } 9120 9121 # Prefix 9122 prefix = self.get_explode_infos_prefix() 9123 9124 # Field 9125 findbypipeline_infos = prefix + findbypipeline_tag 9126 9127 # Variants table 9128 table_variants = self.get_table_variants() 9129 9130 # Header 9131 vcf_reader = self.get_header() 9132 9133 # Create variant id 9134 variant_id_column = self.get_variant_id_column() 9135 added_columns = [variant_id_column] 9136 9137 # variant_id, FORMAT and samples 9138 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9139 self.get_header_sample_list() 9140 ) 9141 9142 # Create dataframe 9143 dataframe_findbypipeline = self.get_query_to_df( 9144 f""" SELECT {samples_fields} FROM {table_variants} """ 9145 ) 9146 9147 # Create findbypipeline column 9148 dataframe_findbypipeline[findbypipeline_infos] = ( 9149 dataframe_findbypipeline.apply( 9150 lambda row: findbypipeline( 9151 row, samples=self.get_header_sample_list() 9152 ), 9153 axis=1, 9154 ) 9155 ) 9156 9157 # Add snpeff_hgvs to header 9158 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9159 findbypipeline_tag, 9160 ".", 9161 "String", 9162 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9163 "howard calculation", 9164 "0", 9165 self.code_type_map.get("String"), 9166 ) 9167 9168 # Update 9169 sql_update = f""" 9170 UPDATE variants 9171 SET "INFO" = 9172 concat( 9173 CASE 9174 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9175 THEN '' 9176 ELSE 
concat("INFO", ';') 9177 END, 9178 CASE 9179 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9180 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9181 THEN concat( 9182 '{findbypipeline_tag}=', 9183 dataframe_findbypipeline."{findbypipeline_infos}" 9184 ) 9185 ELSE '' 9186 END 9187 ) 9188 FROM dataframe_findbypipeline 9189 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9190 """ 9191 self.conn.execute(sql_update) 9192 9193 # Remove added columns 9194 for added_column in added_columns: 9195 self.drop_column(column=added_column) 9196 9197 # Delete dataframe 9198 del dataframe_findbypipeline 9199 gc.collect() 9200 9201 def calculation_genotype_concordance(self) -> None: 9202 """ 9203 The function `calculation_genotype_concordance` calculates the genotype concordance for 9204 multi-caller VCF files and updates the variant information in the database. 9205 """ 9206 9207 # if FORMAT and samples 9208 if ( 9209 "FORMAT" in self.get_header_columns_as_list() 9210 and self.get_header_sample_list() 9211 ): 9212 9213 # genotypeconcordance annotation field 9214 genotypeconcordance_tag = "genotypeconcordance" 9215 9216 # VCF infos tags 9217 vcf_infos_tags = { 9218 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9219 } 9220 9221 # Prefix 9222 prefix = self.get_explode_infos_prefix() 9223 9224 # Field 9225 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9226 9227 # Variants table 9228 table_variants = self.get_table_variants() 9229 9230 # Header 9231 vcf_reader = self.get_header() 9232 9233 # Create variant id 9234 variant_id_column = self.get_variant_id_column() 9235 added_columns = [variant_id_column] 9236 9237 # variant_id, FORMAT and samples 9238 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9239 self.get_header_sample_list() 9240 ) 9241 9242 # Create dataframe 9243 dataframe_genotypeconcordance = self.get_query_to_df( 9244 f""" SELECT 
{samples_fields} FROM {table_variants} """ 9245 ) 9246 9247 # Create genotypeconcordance column 9248 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9249 dataframe_genotypeconcordance.apply( 9250 lambda row: genotypeconcordance( 9251 row, samples=self.get_header_sample_list() 9252 ), 9253 axis=1, 9254 ) 9255 ) 9256 9257 # Add genotypeconcordance to header 9258 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9259 genotypeconcordance_tag, 9260 ".", 9261 "String", 9262 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9263 "howard calculation", 9264 "0", 9265 self.code_type_map.get("String"), 9266 ) 9267 9268 # Update 9269 sql_update = f""" 9270 UPDATE variants 9271 SET "INFO" = 9272 concat( 9273 CASE 9274 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9275 THEN '' 9276 ELSE concat("INFO", ';') 9277 END, 9278 CASE 9279 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9280 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9281 THEN concat( 9282 '{genotypeconcordance_tag}=', 9283 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9284 ) 9285 ELSE '' 9286 END 9287 ) 9288 FROM dataframe_genotypeconcordance 9289 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9290 """ 9291 self.conn.execute(sql_update) 9292 9293 # Remove added columns 9294 for added_column in added_columns: 9295 self.drop_column(column=added_column) 9296 9297 # Delete dataframe 9298 del dataframe_genotypeconcordance 9299 gc.collect() 9300 9301 def calculation_barcode(self, tag: str = "barcode") -> None: 9302 """ 9303 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9304 updates the INFO field in the file with the calculated barcode values. 9305 9306 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9307 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 9308 the default tag name is set to "barcode", defaults to barcode 9309 :type tag: str (optional) 9310 """ 9311 9312 # if FORMAT and samples 9313 if ( 9314 "FORMAT" in self.get_header_columns_as_list() 9315 and self.get_header_sample_list() 9316 ): 9317 9318 # barcode annotation field 9319 if not tag: 9320 tag = "barcode" 9321 9322 # VCF infos tags 9323 vcf_infos_tags = { 9324 tag: "barcode calculation (VaRank)", 9325 } 9326 9327 # Prefix 9328 prefix = self.get_explode_infos_prefix() 9329 9330 # Field 9331 barcode_infos = prefix + tag 9332 9333 # Variants table 9334 table_variants = self.get_table_variants() 9335 9336 # Header 9337 vcf_reader = self.get_header() 9338 9339 # Create variant id 9340 variant_id_column = self.get_variant_id_column() 9341 added_columns = [variant_id_column] 9342 9343 # variant_id, FORMAT and samples 9344 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9345 self.get_header_sample_list() 9346 ) 9347 9348 # Create dataframe 9349 dataframe_barcode = self.get_query_to_df( 9350 f""" SELECT {samples_fields} FROM {table_variants} """ 9351 ) 9352 9353 # Create barcode column 9354 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9355 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9356 ) 9357 9358 # Add barcode to header 9359 vcf_reader.infos[tag] = vcf.parser._Info( 9360 tag, 9361 ".", 9362 "String", 9363 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9364 "howard calculation", 9365 "0", 9366 self.code_type_map.get("String"), 9367 ) 9368 9369 # Update 9370 sql_update = f""" 9371 UPDATE {table_variants} 9372 SET "INFO" = 9373 concat( 9374 CASE 9375 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9376 THEN '' 9377 ELSE concat("INFO", ';') 9378 END, 9379 CASE 9380 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9381 AND dataframe_barcode."{barcode_infos}" NOT NULL 9382 THEN concat( 9383 '{tag}=', 9384 dataframe_barcode."{barcode_infos}" 9385 ) 9386 ELSE '' 9387 
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode for each variant and append it to every
        genotype column: each sample (and FORMAT) gains ':<barcode>:<samples>'
        suffixes, with the two new keys registered in the header as FORMAT
        fields '<tag>' and '<tag>S'.

        The family is read from param
        calculation.calculations.BARCODEFAMILY.family_pedigree, which may be a
        YAML file path, a JSON string, a comma-separated sample list, or a
        dict; when absent, all samples are used.

        :param tag: FORMAT tag used for the family barcode (the sample list
            tag is derived as '<tag>S'), defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or resolves to no samples
        """

        # Only applicable when genotypes exist (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against explicit empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize the pedigree input into a dict
            # {member: sample_name}
            if ped:

                # Pedigree is a file (YAML)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, else treat as a
                # comma-separated sample list
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: already in canonical form
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of sample names from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree: every sample is its own family member
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column, computed per row over the family samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family tags to header as FORMAT fields
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per genotype column: family samples get the
            # barcode value and the sample list, FORMAT gets the tag names,
            # non-family samples get '.' placeholders. Missing genotypes
            # ('./.') are padded with ':.' for each existing FORMAT key first.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Strips FORMAT key names so only the ':' separators remain,
                # then turns each ':' into ':.' to pad the missing genotype
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
9596 """ 9597 9598 # if FORMAT and samples 9599 if ( 9600 "FORMAT" in self.get_header_columns_as_list() 9601 and self.get_header_sample_list() 9602 ): 9603 9604 # trio annotation field 9605 trio_tag = "trio" 9606 9607 # VCF infos tags 9608 vcf_infos_tags = { 9609 "trio": "trio calculation", 9610 } 9611 9612 # Param 9613 param = self.get_param() 9614 9615 # Prefix 9616 prefix = self.get_explode_infos_prefix() 9617 9618 # Trio param 9619 trio_ped = ( 9620 param.get("calculation", {}) 9621 .get("calculations", {}) 9622 .get("TRIO", {}) 9623 .get("trio_pedigree", None) 9624 ) 9625 9626 # Load trio 9627 if trio_ped: 9628 9629 # Trio pedigree is a file 9630 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9631 log.debug("TRIO pedigree is file") 9632 with open(full_path(trio_ped)) as trio_ped: 9633 trio_ped = yaml.safe_load(trio_ped) 9634 9635 # Trio pedigree is a string 9636 elif isinstance(trio_ped, str): 9637 log.debug("TRIO pedigree is str") 9638 try: 9639 trio_ped = json.loads(trio_ped) 9640 log.debug("TRIO pedigree is json str") 9641 except ValueError as e: 9642 trio_samples = trio_ped.split(",") 9643 if len(trio_samples) == 3: 9644 trio_ped = { 9645 "father": trio_samples[0], 9646 "mother": trio_samples[1], 9647 "child": trio_samples[2], 9648 } 9649 log.debug("TRIO pedigree is list str") 9650 else: 9651 msg_error = "TRIO pedigree not well formatted" 9652 log.error(msg_error) 9653 raise ValueError(msg_error) 9654 9655 # Trio pedigree is a dict 9656 elif isinstance(trio_ped, dict): 9657 log.debug("TRIO pedigree is dict") 9658 9659 # Trio pedigree is not well formatted 9660 else: 9661 msg_error = "TRIO pedigree not well formatted" 9662 log.error(msg_error) 9663 raise ValueError(msg_error) 9664 9665 # Construct trio list 9666 trio_samples = [ 9667 trio_ped.get("father", ""), 9668 trio_ped.get("mother", ""), 9669 trio_ped.get("child", ""), 9670 ] 9671 9672 else: 9673 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9674 samples_list = self.get_header_sample_list() 9675 if len(samples_list) >= 3: 9676 trio_samples = self.get_header_sample_list()[0:3] 9677 trio_ped = { 9678 "father": trio_samples[0], 9679 "mother": trio_samples[1], 9680 "child": trio_samples[2], 9681 } 9682 else: 9683 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9684 log.error(msg_error) 9685 raise ValueError(msg_error) 9686 9687 # Check trio pedigree 9688 if not trio_ped or len(trio_ped) != 3: 9689 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9690 log.error(msg_error) 9691 raise ValueError(msg_error) 9692 9693 # Log 9694 log.info( 9695 f"Calculation 'TRIO' - Samples: " 9696 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9697 ) 9698 9699 # Field 9700 trio_infos = prefix + trio_tag 9701 9702 # Variants table 9703 table_variants = self.get_table_variants() 9704 9705 # Header 9706 vcf_reader = self.get_header() 9707 9708 # Create variant id 9709 variant_id_column = self.get_variant_id_column() 9710 added_columns = [variant_id_column] 9711 9712 # variant_id, FORMAT and samples 9713 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9714 self.get_header_sample_list() 9715 ) 9716 9717 # Create dataframe 9718 dataframe_trio = self.get_query_to_df( 9719 f""" SELECT {samples_fields} FROM {table_variants} """ 9720 ) 9721 9722 # Create trio column 9723 dataframe_trio[trio_infos] = dataframe_trio.apply( 9724 lambda row: trio(row, samples=trio_samples), axis=1 9725 ) 9726 9727 # Add trio to header 9728 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9729 trio_tag, 9730 ".", 9731 "String", 9732 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9733 "howard calculation", 9734 "0", 9735 self.code_type_map.get("String"), 9736 ) 9737 9738 # Update 9739 sql_update = f""" 9740 UPDATE {table_variants} 9741 SET "INFO" = 9742 concat( 9743 CASE 9744 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9745 THEN '' 9746 ELSE 
concat("INFO", ';') 9747 END, 9748 CASE 9749 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9750 AND dataframe_trio."{trio_infos}" NOT NULL 9751 THEN concat( 9752 '{trio_tag}=', 9753 dataframe_trio."{trio_infos}" 9754 ) 9755 ELSE '' 9756 END 9757 ) 9758 FROM dataframe_trio 9759 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9760 """ 9761 self.conn.execute(sql_update) 9762 9763 # Remove added columns 9764 for added_column in added_columns: 9765 self.drop_column(column=added_column) 9766 9767 # Delete dataframe 9768 del dataframe_trio 9769 gc.collect() 9770 9771 def calculation_vaf_normalization(self) -> None: 9772 """ 9773 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9774 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9775 :return: The function does not return anything. 9776 """ 9777 9778 # if FORMAT and samples 9779 if ( 9780 "FORMAT" in self.get_header_columns_as_list() 9781 and self.get_header_sample_list() 9782 ): 9783 9784 # vaf_normalization annotation field 9785 vaf_normalization_tag = "VAF" 9786 9787 # VCF infos tags 9788 vcf_infos_tags = { 9789 "VAF": "VAF Variant Frequency", 9790 } 9791 9792 # Prefix 9793 prefix = self.get_explode_infos_prefix() 9794 9795 # Variants table 9796 table_variants = self.get_table_variants() 9797 9798 # Header 9799 vcf_reader = self.get_header() 9800 9801 # Do not calculate if VAF already exists 9802 if "VAF" in vcf_reader.formats: 9803 log.debug("VAF already on genotypes") 9804 return 9805 9806 # Create variant id 9807 variant_id_column = self.get_variant_id_column() 9808 added_columns = [variant_id_column] 9809 9810 # variant_id, FORMAT and samples 9811 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9812 f""" "{sample}" """ for sample in self.get_header_sample_list() 9813 ) 9814 9815 # Create dataframe 9816 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9817 log.debug(f"query={query}") 9818 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9819 9820 vaf_normalization_set = [] 9821 9822 # for each sample vaf_normalization 9823 for sample in self.get_header_sample_list(): 9824 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9825 lambda row: vaf_normalization(row, sample=sample), axis=1 9826 ) 9827 vaf_normalization_set.append( 9828 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9829 ) 9830 9831 # Add VAF to FORMAT 9832 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9833 "FORMAT" 9834 ].apply(lambda x: str(x) + ":VAF") 9835 vaf_normalization_set.append( 9836 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9837 ) 9838 9839 # Add vaf_normalization to header 9840 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9841 id=vaf_normalization_tag, 9842 num="1", 9843 type="Float", 9844 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9845 type_code=self.code_type_map.get("Float"), 9846 ) 9847 9848 # Create fields to add in INFO 9849 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9850 9851 # Update 9852 sql_update = f""" 9853 UPDATE {table_variants} 9854 SET {sql_vaf_normalization_set} 9855 FROM dataframe_vaf_normalization 9856 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9857 9858 """ 9859 self.conn.execute(sql_update) 9860 9861 # Remove added columns 9862 for added_column in added_columns: 9863 self.drop_column(column=added_column) 9864 9865 # Delete dataframe 9866 del dataframe_vaf_normalization 9867 gc.collect() 9868 9869 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9870 """ 9871 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9872 field in a VCF file and updates the INFO column of the variants table with the calculated 9873 statistics. 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Calculate per-variant statistics (count, list, min, max, mean, median,
        standard deviation) of a genotype value across all samples, and append
        each statistic to the INFO field as '<info>_stats_<name>=<value>',
        registering every tag in the VCF header.

        :param info: name of the genotype value the statistics are computed
            over (e.g. "VAF" or "DP"); it prefixes all generated tags,
            defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotypes exist (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO tag per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field holding the whole stats dict per variant
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: genotype_stats returns a dict of all
            # statistics for the row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own column
                # (lambda is applied immediately, so the 'stat' loop variable
                # is safe despite late binding)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Register the statistic tag in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator only needed from the second field onwards
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                # NOTE(review): missing stats default to '' above, which is
                # NOT NULL in SQL, so an empty 'stat=' entry may be emitted —
                # confirm whether that is intended.
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
10013 10014 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10015 is a string parameter that represents the information field to be used in the transcripts JSON. 10016 It is used to specify the JSON format for the transcripts information. If no value is provided 10017 when calling the method, it defaults to " 10018 :type info_json: str 10019 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10020 method is a string parameter that specifies the format of the information field to be used in 10021 the transcripts JSON. It is used to define the format of the information field 10022 :type info_format: str 10023 """ 10024 10025 # Create transcripts table 10026 transcripts_table = self.create_transcript_view() 10027 10028 # Add info field 10029 if transcripts_table: 10030 self.transcript_view_to_variants( 10031 transcripts_table=transcripts_table, 10032 transcripts_info_field_json=info_json, 10033 transcripts_info_field_format=info_format, 10034 ) 10035 else: 10036 log.info("No Transcripts to process. Check param.json file configuration") 10037 10038 def calculation_transcripts_prioritization(self) -> None: 10039 """ 10040 The function `calculation_transcripts_prioritization` creates a transcripts table and 10041 prioritizes transcripts based on certain criteria. 10042 """ 10043 10044 # Create transcripts table 10045 transcripts_table = self.create_transcript_view() 10046 10047 # Add info field 10048 if transcripts_table: 10049 self.transcripts_prioritization(transcripts_table=transcripts_table) 10050 else: 10051 log.info("No Transcripts to process. 
Check param.json file configuration") 10052 10053 def calculation_transcripts_export(self) -> None: 10054 """ """ 10055 10056 # Create transcripts table 10057 transcripts_table = self.create_transcript_view() 10058 10059 # Add info field 10060 if transcripts_table: 10061 self.transcripts_export(transcripts_table=transcripts_table) 10062 else: 10063 log.info("No Transcripts to process. Check param.json file configuration") 10064 10065 ############### 10066 # Transcripts # 10067 ############### 10068 10069 def transcripts_export( 10070 self, transcripts_table: str = None, param: dict = {} 10071 ) -> bool: 10072 """ """ 10073 10074 log.debug("Start transcripts export...") 10075 10076 # Param 10077 if not param: 10078 param = self.get_param() 10079 10080 # Param export 10081 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10082 10083 # Output file 10084 transcripts_export_output = param_transcript_export.get("output", None) 10085 10086 if not param_transcript_export or not transcripts_export_output: 10087 log.warning(f"No transcriipts export parameters defined!") 10088 return False 10089 10090 # List of transcripts annotations 10091 query_describe = f""" 10092 SELECT column_name 10093 FROM ( 10094 DESCRIBE SELECT * FROM {transcripts_table} 10095 ) 10096 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10097 """ 10098 transcripts_annotations_list = list( 10099 self.get_query_to_df(query=query_describe)["column_name"] 10100 ) 10101 10102 # Create transcripts table for export 10103 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10104 random.choices(string.ascii_uppercase + string.digits, k=10) 10105 ) 10106 query_create_transcripts_table_export = f""" 10107 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10108 """ 10109 self.execute_query(query=query_create_transcripts_table_export) 10110 10111 # 
Output file format 10112 transcripts_export_output_format = get_file_format( 10113 filename=transcripts_export_output 10114 ) 10115 10116 # Format VCF - construct INFO 10117 if transcripts_export_output_format in ["vcf"]: 10118 10119 # Construct query update INFO and header 10120 query_update_info = [] 10121 for field in transcripts_annotations_list: 10122 10123 # If field not in header 10124 if field not in self.get_header_infos_list(): 10125 10126 # Add PZ Transcript in header 10127 self.get_header().infos[field] = vcf.parser._Info( 10128 field, 10129 ".", 10130 "String", 10131 f"Annotation '{field}' from transcript view", 10132 "unknown", 10133 "unknown", 10134 0, 10135 ) 10136 10137 # Add field as INFO/tag 10138 query_update_info.append( 10139 f""" 10140 CASE 10141 WHEN "{field}" IS NOT NULL 10142 THEN concat('{field}=', "{field}", ';') 10143 ELSE '' 10144 END 10145 """ 10146 ) 10147 10148 # Query param 10149 query_update_info_value = ( 10150 f""" concat('', {", ".join(query_update_info)}) """ 10151 ) 10152 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' 
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Prioritize transcripts and write the selected transcript (and its
        prioritization fields) back into the variants table INFO column.

        For each variant the transcripts are ranked by the configured PZ
        fields (and, optionally, a transcript-preference file); the top-ranked
        transcript is exported as '<pzprefix>Transcript=<id>' plus one INFO
        tag per configured PZ field.

        :param transcripts_table: name of the table containing transcripts; it
            is created via create_transcript_view() when None
        :type transcripts_table: str
        :param param: parameters dict; falls back to self.get_param() when empty
        :type param: dict
        :return: True when prioritization completed, False when no profile is
            defined or prioritization did not run
        :raises ValueError: if no transcripts table is available, or a field to
            explode is unknown, or the transcript preference file is missing
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists (prioritization writes into it)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: maps source field -> INFO tag name
        pz_param_pzfields = {}

        # PZ field transcripts
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields always produced by prioritization
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: mandatory fields map to themselves (prefixed),
        # extra fields map to a prefixed tag and are declared in the header
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add the prefixed annotation field in header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param
        # NOTE(review): this mutates pz_param (and thus the caller's param
        # dict) in place before it is passed to self.prioritization() below —
        # confirm this side effect is intended.
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by (defaults to Flag DESC, Score DESC with the configured prefix)
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: must exist in the header or in the table
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing:
            # forced preference ranks first, otherwise it only breaks ties
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            # (split_part drops the '.version' suffix when versions are ignored)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update: rank transcripts per variant
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (no preference file)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table:
        # only the top-ranked transcript (rn = 1) per variant is written.
        # NOTE(review): the WHERE clause hard-codes the 'variants.' alias
        # instead of {table_variants} — confirm the variants table is always
        # named 'variants' here.
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
            # Query ranking for update (transcripts-preference variant):
            # rank transcripts per variant, joining the preference file to get
            # each transcript's preference order (row number in the file).
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                    ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (no preference file):
            # rank transcripts per variant using the configured ORDER BY only.
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table:
        # append '<pz_fields_transcripts>=<transcript>' (plus the optional
        # pz fields built in query_update_concat_list) to INFO for the
        # top-ranked (rn = 1) transcript of each variant.
        # NOTE(review): the WHERE clause hard-codes the alias 'variants' —
        # assumes {table_variants} is literally named/aliased 'variants'; verify.
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True

    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        # NOTE(review): despite the 'dict' annotation, this is iterated as a
        # LIST of mapping dicts (see the commented example below) — confirm
        # and consider annotating as list.
        columns_maps: dict = {},
        # NOTE(review): mutable default argument, extended in place via '+=';
        # calls relying on the default share (and accumulate into) one list.
        added_columns: list = [],
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list, list]:
        """
        Create one temporary transcript table per entry of `columns_maps`.

        For each mapping, the comma-separated `transcripts_column` (and its
        companion `transcripts_infos_columns`) are first exploded from INFO
        into real columns, then split row-wise (`regexp_split_to_table`) so
        that each transcript of a variant becomes one row of a temporary table.

        :param transcripts_table: base name used as prefix for the temporary
        tables created by this method, defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param columns_maps: list of mapping configurations; each entry gives
        the main `transcripts_column` plus `transcripts_infos_columns`, and may
        override `column_rename`, `column_clean` and `column_case`
        :type columns_maps: dict
        :param added_columns: accumulator of columns added by `explode_infos`
        (returned so the caller can drop them afterwards)
        :type added_columns: list
        :param temporary_tables: accumulator of temporary table names created
        here; a new list is created when None
        :type temporary_tables: list
        :param annotation_fields: accumulator of annotation field names
        (post rename/clean/case) exposed by the created tables; a new list is
        created when None
        :type annotation_fields: list
        :param column_rename: mapping of original column names to renamed ones
        :type column_rename: dict
        :param column_clean: if True, column names are cleaned with
        `clean_annotation_field`, defaults to False
        :type column_clean: bool (optional)
        :param column_case: "lower" or "upper" to force column-name case;
        None leaves names unchanged
        :type column_case: str
        :return: the tuple (`added_columns`, `temporary_tables`,
        `annotation_fields`).
        """

        # NOTE(review): typo "transcrpts" is in a runtime log string (left as-is).
        log.debug("Start transcrpts view creation from columns map...")

        # Example of expected `columns_maps` content:
        # "from_columns_map": [
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "Ensembl_geneid",
        #             "LIST_S2_score",
        #             "LIST_S2_pred",
        #         ],
        #     },
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "VARITY_R_score",
        #             "Aloft_pred",
        #         ],
        #     },
        # ],

        # Init (avoid shared mutable defaults for these two accumulators)
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        # Variants table
        table_variants = self.get_table_variants()

        for columns_map in columns_maps:

            # Transcript column
            transcripts_column = columns_map.get("transcripts_column", None)

            # Transcripts infos columns
            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

            # Transcripts infos columns rename
            # NOTE(review): these three assignments overwrite the method-level
            # parameters, so a value set by one columns_map leaks into the
            # following iterations as the new default — confirm intended.
            column_rename = columns_map.get("column_rename", column_rename)

            # Transcripts infos columns clean
            column_clean = columns_map.get("column_clean", column_clean)

            # Transcripts infos columns case
            column_case = columns_map.get("column_case", column_case)

            if transcripts_column is not None:

                # Explode transcript column and its info columns from INFO
                added_columns += self.explode_infos(
                    fields=[transcripts_column] + transcripts_infos_columns
                )

                # View clauses
                # NOTE(review): 'tanscripts' is a typo kept for token identity.
                clause_select_variants = []
                clause_select_tanscripts = []
                for field in [transcripts_column] + transcripts_infos_columns:

                    # Output name defaults to the source field name
                    as_field = field

                    # Rename
                    if column_rename:
                        as_field = column_rename.get(as_field, as_field)

                    # Clean
                    if column_clean:
                        as_field = clean_annotation_field(as_field)

                    # Case
                    if column_case:
                        if column_case.lower() in ["lower"]:
                            as_field = as_field.lower()
                        elif column_case.lower() in ["upper"]:
                            as_field = as_field.upper()

                    # Inner query: split every comma-separated column row-wise
                    clause_select_variants.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )

                    # Outer query: keep the transcript column under its original
                    # name; expose info columns under their final (as_field) name
                    if field in [transcripts_column]:
                        clause_select_tanscripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                        )
                    else:
                        clause_select_tanscripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
                        )
                        annotation_fields.append(as_field)

                # Query view
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select_tanscripts)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT, INFO,
                            {", ".join(clause_select_variants)}
                        FROM {table_variants}
                    )
                    WHERE "{transcripts_column}" IS NOT NULL
                """

                # Create temporary table name (random suffix to avoid clashes)
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # Temporary_tables
                temporary_tables.append(temporary_table)
                query_view = f"""
                    CREATE TEMPORARY TABLE {temporary_table}
                    AS ({query})
                """
                self.execute_query(query=query_view)

        return added_columns, temporary_tables, annotation_fields

    def create_transcript_view_from_column_format(
        self,
        transcripts_table: str = "transcripts",
        column_formats: dict = {},
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list]:
        """
        The `create_transcript_view_from_column_format` function generates a transcript view based on
        specified column formats, adds additional
        columns and annotation fields, and returns the list of
        temporary tables and annotation fields.

        :param transcripts_table: name used as prefix for the temporary view
        created for each column format; defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param column_formats: list of format configurations; each entry gives
        the annotated INFO field (`transcripts_column`, e.g. "ANN") and the
        sub-field holding the transcript id (`transcripts_infos_column`, e.g.
        "Feature_ID"), and may override `column_rename`, `column_clean` and
        `column_case`
        :type column_formats: dict
        :param temporary_tables: accumulator of temporary view names created
        during the process; a new list is created when None
        :type temporary_tables: list
        :param annotation_fields: accumulator of annotation field names found
        in the created views (all columns except '#CHROM', 'POS', 'REF',
        'ALT'); a new list is created when None
        :type annotation_fields: list
        :param column_rename: mapping of original column names to renamed ones,
        forwarded to `annotation_format_to_table`
        :type column_rename: dict
        :param column_clean: if True, column names are cleaned during view
        creation, defaults to False
        :type column_clean: bool (optional)
        :param column_case: "upper" or "lower" to force column-name case
        :type column_case: str
        :return: the tuple (`temporary_tables`, `annotation_fields`).
        """

        # NOTE(review): typo "transcrpts" is in a runtime log string (left as-is).
        log.debug("Start transcrpts view creation from column format...")

        # Example of expected `column_formats` content:
        # "from_column_format": [
        #     {
        #         "transcripts_column": "ANN",
        #         "transcripts_infos_column": "Feature_ID",
        #     }
        # ],

        # Init (avoid shared mutable defaults for these two accumulators)
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        for column_format in column_formats:

            # annotation field and transcript annotation field
            annotation_field = column_format.get("transcripts_column", "ANN")
            transcript_annotation = column_format.get(
                "transcripts_infos_column", "Feature_ID"
            )

            # Transcripts infos columns rename
            # NOTE(review): these assignments overwrite the method-level
            # parameters, so values set by one column_format leak into the
            # following iterations as the new default — confirm intended.
            column_rename = column_format.get("column_rename", column_rename)

            # Transcripts infos columns clean
            column_clean = column_format.get("column_clean", column_clean)

            # Transcripts infos columns case
            column_case = column_format.get("column_case", column_case)

            # Temporary view name (random suffix to avoid clashes)
            temporary_view_name = transcripts_table + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            )

            # Create the temporary table from the annotation format; returns
            # None when the annotation field is absent from the header
            temporary_view_name = self.annotation_format_to_table(
                uniquify=True,
                annotation_field=annotation_field,
                view_name=temporary_view_name,
                annotation_id=transcript_annotation,
                column_rename=column_rename,
                column_clean=column_clean,
                column_case=column_case,
            )

            # Annotation fields
            if temporary_view_name:
                # List the view's columns, excluding the variant key columns
                query_annotation_fields = f"""
                    SELECT *
                    FROM (
                        DESCRIBE SELECT *
                        FROM {temporary_view_name}
                    )
                    WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
                """
                df_annotation_fields = self.get_query_to_df(
                    query=query_annotation_fields
                )

                # Add temporary view and annotation fields
                temporary_tables.append(temporary_view_name)
                annotation_fields += list(set(df_annotation_fields["column_name"]))

        return temporary_tables, annotation_fields

    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        The `create_transcript_view` function generates a transcript view by processing data from a
        specified table based on provided parameters and structural information.

        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
        is used to specify the name of the table that will store the final transcript view data. If a table
        name is not provided, the function will create a new table to store the transcript view data, and by
        default,, defaults to transcripts
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
        `create_transcript_view` function is a boolean parameter that determines whether to drop the
        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
        the function will drop the existing transcripts table if it exists, defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
        contains information needed to create a transcript view. It includes details such as the structure
        of the transcripts, columns mapping, column formats, and other necessary information for generating
        the view. This parameter allows for flexibility and customization
        :type param: dict
        :return: The `create_transcript_view` function returns the name of the transcripts table that was
        created or modified during the execution of the function.
        """

        log.debug("Start transcripts view creation...")

        # Default table name when neither argument nor param provides one
        transcripts_table_default = "transcripts"

        # Param
        if not param:
            param = self.get_param()

        # Struct: describes how transcripts are encoded in the variants table
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: strip the '.N' version suffix from transcript ids
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping file (two columns: transcript, alias)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping force: keep only transcripts present in the file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added by explode_infos (dropped again at the end)
            added_columns = []

            # Temporary tables
            temporary_tables = []

            # Annotation fields
            annotation_fields = []

            # from columns map
            # NOTE(review): the helper mutates the passed-in lists AND returns
            # them, so the '+=' below may duplicate entries; later set() calls
            # deduplicate tables/fields but added_columns may repeat — confirm
            # drop_column tolerates duplicates.
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # from column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Remove key/reserved columns from the annotation field list
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query (UNION BY NAME of all of them)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested sub-queries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # SELECT clauses aggregating annotations per transcript
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list (distinct transcripts, comma-joined)
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields the same way
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Transcript dataframe.
                # NOTE(review): the variable appears unused but must keep this
                # exact name — the SQL below references
                # 'transcript_id_mapping_dataframe', which DuckDB resolves to
                # this local pandas DataFrame (replacement scan).
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version remove
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Transcript column for group by merge: prefer the mapped id,
                # fall back to the original (both without version suffix)
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        # NOTE(review): 'transcript_original' is re-exposed as
                        # 'transcripts_mapped' — the name looks inverted/typoed
                        # (vs 'transcript_mapped' above); confirm intended.
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcript view if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove the columns that explode_infos added to the variants table
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No struct configured: nothing to build
            transcripts_table = None

        return transcripts_table

    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        The `annotation_format_to_table` function converts annotation data from a VCF file into a
        structured table format, ensuring unique values and creating a temporary table for further
        processing or analysis.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
        If set to `True`, the function will make sure that the
        output values are unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: name of the INFO field holding the annotation
        (e.g. snpEff "ANN"); its sub-field layout is read from the VCF header
        description, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: name of the annotation sub-field holding the
        transcript identifier; exposed as the 'transcript' column of the
        created table, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: name of the temporary table created to hold the
        exploded annotation data, defaults to transcripts
        :type view_name: str (optional)
        :param column_rename: mapping of original sub-field names to renamed
        column names (also applied to `annotation_id`)
        :type column_rename: dict
        :param column_clean: if True, sub-field names are cleaned with
        `clean_annotation_field` before being used as column names, defaults
        to False
        :type column_clean: bool (optional)
        :param column_case: "lower" or "upper" to force column-name case;
        None leaves names unchanged
        :type column_case: str
        :return: the name of the created table (`view_name`), or None when
        `annotation_field` is not present in the VCF header.
        :raises ValueError: when the annotation field's header description
        does not contain a quoted ' | '-separated sub-field list.
        """

        # Name of the intermediate JSON column built from the annotation field
        annotation_format = "annotation_explode"

        # Apply rename/clean to the transcript-id sub-field name as well
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix
        # NOTE(review): any non-empty configured prefix is replaced by the
        # hard-coded "INFO/" — confirm this matches explode_infos' behavior.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the annotation field and the JSON column
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way (dropped again at the end)
        added_columns = []

        # Explode the annotation INFO field into a real column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field list from the header description:
            # expects a quoted, ' | '-separated list (snpEff ANN style)
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key -> original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe (queried below by its variable name via DuckDB)
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation value to a JSON document keyed by the
            # header sub-field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the distinct JSON keys actually present
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT clause per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]
                key_clean = key

                # key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract this key's values to sample them for type detection
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty
                # strings or None with NaN and drop rows with NaN, so type
                # detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Append the typed extraction clause (empty string -> NULL)
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, exposing annotation_id as 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: signal failure with None
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        The `transcript_view_to_variants` function updates a variants table with information from
        transcripts in JSON format.
11301 11302 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11303 table containing the transcripts data. If this parameter is not provided, the function will 11304 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11305 :type transcripts_table: str 11306 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11307 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11308 identifier is used to match transcripts with variants in the database 11309 :type transcripts_column_id: str 11310 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11311 of the column in the variants table where the transcripts information will be stored in JSON 11312 format. This parameter allows you to define the column in the variants table that will hold the 11313 JSON-formatted information about transcripts 11314 :type transcripts_info_json: str 11315 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11316 specify the field in the VCF header that will contain information about transcripts in JSON 11317 format. This field will be added to the VCF header as an INFO field with the specified name 11318 :type transcripts_info_field_json: str 11319 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11320 format of the information about transcripts that will be stored in the variants table. This 11321 format can be used to define how the transcript information will be structured or displayed 11322 within the variants table 11323 :type transcripts_info_format: str 11324 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11325 specify the field in the VCF header that will contain information about transcripts in a 11326 specific format. 
This field will be added to the VCF header as an INFO field with the specified 11327 name 11328 :type transcripts_info_field_format: str 11329 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11330 that contains various configuration settings related to transcripts. It is used to provide 11331 default values for certain parameters if they are not explicitly provided when calling the 11332 method. The `param` dictionary can be passed as an argument 11333 :type param: dict 11334 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11335 if the operation is successful and `False` if certain conditions are not met. 11336 """ 11337 11338 msg_info_prefix = "Start transcripts view to variants annotations" 11339 11340 log.debug(f"{msg_info_prefix}...") 11341 11342 # Default 11343 transcripts_table_default = "transcripts" 11344 transcripts_column_id_default = "transcript" 11345 transcripts_info_json_default = None 11346 transcripts_info_format_default = None 11347 transcripts_info_field_json_default = None 11348 transcripts_info_field_format_default = None 11349 11350 # Param 11351 if not param: 11352 param = self.get_param() 11353 11354 # Transcripts table 11355 if transcripts_table is None: 11356 transcripts_table = param.get("transcripts", {}).get( 11357 "table", transcripts_table_default 11358 ) 11359 11360 # Transcripts column ID 11361 if transcripts_column_id is None: 11362 transcripts_column_id = param.get("transcripts", {}).get( 11363 "column_id", transcripts_column_id_default 11364 ) 11365 11366 # Transcripts info json 11367 if transcripts_info_json is None: 11368 transcripts_info_json = param.get("transcripts", {}).get( 11369 "transcripts_info_json", transcripts_info_json_default 11370 ) 11371 11372 # Transcripts info field JSON 11373 if transcripts_info_field_json is None: 11374 transcripts_info_field_json = param.get("transcripts", {}).get( 11375 
"transcripts_info_field_json", transcripts_info_field_json_default 11376 ) 11377 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11378 # transcripts_info_json = transcripts_info_field_json 11379 11380 # Transcripts info format 11381 if transcripts_info_format is None: 11382 transcripts_info_format = param.get("transcripts", {}).get( 11383 "transcripts_info_format", transcripts_info_format_default 11384 ) 11385 11386 # Transcripts info field FORMAT 11387 if transcripts_info_field_format is None: 11388 transcripts_info_field_format = param.get("transcripts", {}).get( 11389 "transcripts_info_field_format", transcripts_info_field_format_default 11390 ) 11391 # if ( 11392 # transcripts_info_field_format is not None 11393 # and transcripts_info_format is None 11394 # ): 11395 # transcripts_info_format = transcripts_info_field_format 11396 11397 # Variants table 11398 table_variants = self.get_table_variants() 11399 11400 # Check info columns param 11401 if ( 11402 transcripts_info_json is None 11403 and transcripts_info_field_json is None 11404 and transcripts_info_format is None 11405 and transcripts_info_field_format is None 11406 ): 11407 return False 11408 11409 # Transcripts infos columns 11410 query_transcripts_infos_columns = f""" 11411 SELECT * 11412 FROM ( 11413 DESCRIBE SELECT * FROM {transcripts_table} 11414 ) 11415 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11416 """ 11417 transcripts_infos_columns = list( 11418 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11419 ) 11420 11421 # View results 11422 clause_select = [] 11423 clause_to_json = [] 11424 clause_to_format = [] 11425 for field in transcripts_infos_columns: 11426 # Do not consider INFO field for export into fields 11427 if field not in ["INFO"]: 11428 clause_select.append( 11429 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11430 ) 11431 clause_to_json.append(f""" 
'{field}': "{field}" """) 11432 clause_to_format.append(f""" "{field}" """) 11433 11434 # Update 11435 update_set_json = [] 11436 update_set_format = [] 11437 11438 # VCF header 11439 vcf_reader = self.get_header() 11440 11441 # Transcripts to info column in JSON 11442 if transcripts_info_json: 11443 11444 # Create column on variants table 11445 self.add_column( 11446 table_name=table_variants, 11447 column_name=transcripts_info_json, 11448 column_type="JSON", 11449 default_value=None, 11450 drop=False, 11451 ) 11452 11453 # Add header 11454 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11455 transcripts_info_json, 11456 ".", 11457 "String", 11458 "Transcripts in JSON format", 11459 "unknwon", 11460 "unknwon", 11461 self.code_type_map["String"], 11462 ) 11463 11464 # Add to update 11465 update_set_json.append( 11466 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11467 ) 11468 11469 # Transcripts to info field in JSON 11470 if transcripts_info_field_json: 11471 11472 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11473 11474 # Add to update 11475 update_set_json.append( 11476 f""" 11477 INFO = concat( 11478 CASE 11479 WHEN INFO NOT IN ('', '.') 11480 THEN INFO 11481 ELSE '' 11482 END, 11483 CASE 11484 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11485 THEN concat( 11486 ';{transcripts_info_field_json}=', 11487 t.{transcripts_info_json} 11488 ) 11489 ELSE '' 11490 END 11491 ) 11492 """ 11493 ) 11494 11495 # Add header 11496 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11497 transcripts_info_field_json, 11498 ".", 11499 "String", 11500 "Transcripts in JSON format", 11501 "unknwon", 11502 "unknwon", 11503 self.code_type_map["String"], 11504 ) 11505 11506 if update_set_json: 11507 11508 # Update query 11509 query_update = f""" 11510 UPDATE {table_variants} 11511 SET {", ".join(update_set_json)} 11512 FROM 11513 ( 11514 SELECT 11515 "#CHROM", POS, REF, ALT, 11516 concat( 11517 '{{', 11518 
string_agg( 11519 '"' || "{transcripts_column_id}" || '":' || 11520 to_json(json_output) 11521 ), 11522 '}}' 11523 )::JSON AS {transcripts_info_json} 11524 FROM 11525 ( 11526 SELECT 11527 "#CHROM", POS, REF, ALT, 11528 "{transcripts_column_id}", 11529 to_json( 11530 {{{",".join(clause_to_json)}}} 11531 )::JSON AS json_output 11532 FROM 11533 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11534 WHERE "{transcripts_column_id}" IS NOT NULL 11535 ) 11536 GROUP BY "#CHROM", POS, REF, ALT 11537 ) AS t 11538 WHERE {table_variants}."#CHROM" = t."#CHROM" 11539 AND {table_variants}."POS" = t."POS" 11540 AND {table_variants}."REF" = t."REF" 11541 AND {table_variants}."ALT" = t."ALT" 11542 """ 11543 11544 self.execute_query(query=query_update) 11545 11546 # Transcripts to info column in FORMAT 11547 if transcripts_info_format: 11548 11549 # Create column on variants table 11550 self.add_column( 11551 table_name=table_variants, 11552 column_name=transcripts_info_format, 11553 column_type="VARCHAR", 11554 default_value=None, 11555 drop=False, 11556 ) 11557 11558 # Add header 11559 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11560 transcripts_info_format, 11561 ".", 11562 "String", 11563 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11564 "unknwon", 11565 "unknwon", 11566 self.code_type_map["String"], 11567 ) 11568 11569 # Add to update 11570 update_set_format.append( 11571 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11572 ) 11573 11574 else: 11575 11576 # Set variable for internal queries 11577 transcripts_info_format = "transcripts_info_format" 11578 11579 # Transcripts to info field in JSON 11580 if transcripts_info_field_format: 11581 11582 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11583 11584 # Add to update 11585 update_set_format.append( 11586 f""" 11587 INFO = concat( 11588 CASE 11589 WHEN INFO NOT IN ('', 
'.') 11590 THEN INFO 11591 ELSE '' 11592 END, 11593 CASE 11594 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11595 THEN concat( 11596 ';{transcripts_info_field_format}=', 11597 t.{transcripts_info_format} 11598 ) 11599 ELSE '' 11600 END 11601 ) 11602 """ 11603 ) 11604 11605 # Add header 11606 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11607 transcripts_info_field_format, 11608 ".", 11609 "String", 11610 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11611 "unknwon", 11612 "unknwon", 11613 self.code_type_map["String"], 11614 ) 11615 11616 if update_set_format: 11617 11618 # Update query 11619 query_update = f""" 11620 UPDATE {table_variants} 11621 SET {", ".join(update_set_format)} 11622 FROM 11623 ( 11624 SELECT 11625 "#CHROM", POS, REF, ALT, 11626 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11627 FROM 11628 ( 11629 SELECT 11630 "#CHROM", POS, REF, ALT, 11631 "{transcripts_column_id}", 11632 concat( 11633 "{transcripts_column_id}", 11634 '|', 11635 {", '|', ".join(clause_to_format)} 11636 ) AS {transcripts_info_format} 11637 FROM 11638 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11639 ) 11640 GROUP BY "#CHROM", POS, REF, ALT 11641 ) AS t 11642 WHERE {table_variants}."#CHROM" = t."#CHROM" 11643 AND {table_variants}."POS" = t."POS" 11644 AND {table_variants}."REF" = t."REF" 11645 AND {table_variants}."ALT" = t."ALT" 11646 """ 11647 11648 self.execute_query(query=query_update) 11649 11650 return True 11651 11652 def rename_info_fields( 11653 self, fields_to_rename: dict = None, table: str = None 11654 ) -> dict: 11655 """ 11656 The `rename_info_fields` function renames specified fields in a VCF file header and updates 11657 corresponding INFO fields in the variants table. 
11658 11659 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11660 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11661 represent the original field names that need to be renamed, and the corresponding values 11662 represent the new names to which the fields should be 11663 :type fields_to_rename: dict 11664 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11665 the table in which the variants data is stored. This table contains information about genetic 11666 variants, and the function updates the corresponding INFO fields in this table when renaming 11667 specified fields in the VCF file header 11668 :type table: str 11669 :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains 11670 the original field names as keys and their corresponding new names (or None if the field was 11671 removed) as values after renaming or removing specified fields in a VCF file header and updating 11672 corresponding INFO fields in the variants table. 
11673 """ 11674 11675 # Init 11676 fields_renamed = {} 11677 config = self.get_config() 11678 access = config.get("access") 11679 11680 if table is None: 11681 table = self.get_table_variants() 11682 11683 # regexp replace fonction 11684 regex_replace_dict = {} 11685 regex_replace_nb = 0 11686 regex_replace_partition = 125 11687 regex_replace = "INFO" 11688 11689 if fields_to_rename is not None and access not in ["RO"]: 11690 11691 log.info("Rename or remove fields...") 11692 11693 # Header 11694 header = self.get_header() 11695 11696 for field_to_rename, field_renamed in fields_to_rename.items(): 11697 11698 if field_to_rename in header.infos: 11699 11700 # Rename header 11701 if field_renamed is not None: 11702 header.infos[field_renamed] = vcf.parser._Info( 11703 field_renamed, 11704 header.infos[field_to_rename].num, 11705 header.infos[field_to_rename].type, 11706 header.infos[field_to_rename].desc, 11707 header.infos[field_to_rename].source, 11708 header.infos[field_to_rename].version, 11709 header.infos[field_to_rename].type_code, 11710 ) 11711 del header.infos[field_to_rename] 11712 11713 # Rename INFO patterns 11714 field_pattern = rf'(^|;)({field_to_rename})=([^;]*)' 11715 if field_renamed is not None: 11716 field_renamed_pattern = rf'\1{field_renamed}=\3' 11717 else: 11718 field_renamed_pattern = '' 11719 11720 # regexp replace 11721 regex_replace_nb += 1 11722 regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition) 11723 if (regex_replace_nb % regex_replace_partition) == 0: 11724 regex_replace = "INFO" 11725 regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')" 11726 regex_replace_dict[regex_replace_key] = regex_replace 11727 11728 # Return 11729 fields_renamed[field_to_rename] = field_renamed 11730 11731 # Log 11732 if field_renamed is not None: 11733 log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'") 11734 else: 11735 log.info(f"Rename or remove 
fields - field '{field_to_rename}' removed") 11736 11737 # Rename INFO 11738 for regex_replace_key, regex_replace in regex_replace_dict.items(): 11739 log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...") 11740 query = f""" 11741 UPDATE {table} 11742 SET 11743 INFO = {regex_replace} 11744 """ 11745 log.debug(f"query={query}") 11746 self.execute_query(query=query) 11747 11748 return fields_renamed 11749 11750 def calculation_rename_info_fields( 11751 self, 11752 fields_to_rename: dict = None, 11753 table: str = None, 11754 operation_name: str = "RENAME_INFO_FIELDS", 11755 ) -> None: 11756 """ 11757 The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates 11758 fields to rename and table if provided, and then calls another function to rename the fields. 11759 11760 :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be 11761 renamed in a table. Each key-value pair in the dictionary represents the original field name as 11762 the key and the new field name as the value 11763 :type fields_to_rename: dict 11764 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11765 specify the name of the table for which the fields are to be renamed. It is a string type 11766 parameter 11767 :type table: str 11768 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11769 method is a string that specifies the name of the operation being performed. 
In this context, it 11770 is used as a default value for the operation name if not explicitly provided when calling the 11771 function, defaults to RENAME_INFO_FIELDS 11772 :type operation_name: str (optional) 11773 """ 11774 11775 # Param 11776 param = self.get_param() 11777 11778 # Get param fields to rename 11779 param_fields_to_rename = ( 11780 param.get("calculation", {}) 11781 .get("calculations", {}) 11782 .get(operation_name, {}) 11783 .get("fields_to_rename", None) 11784 ) 11785 11786 # Get param table 11787 param_table = ( 11788 param.get("calculation", {}) 11789 .get("calculations", {}) 11790 .get(operation_name, {}) 11791 .get("table", None) 11792 ) 11793 11794 # Init fields_to_rename 11795 if fields_to_rename is None: 11796 fields_to_rename = param_fields_to_rename 11797 11798 # Init table 11799 if table is None: 11800 table = param_table 11801 11802 renamed_fields = self.rename_info_fields( 11803 fields_to_rename=fields_to_rename, table=table 11804 ) 11805 11806 log.debug(f"renamed_fields:{renamed_fields}")
39 def __init__( 40 self, 41 conn=None, 42 input: str = None, 43 output: str = None, 44 config: dict = {}, 45 param: dict = {}, 46 load: bool = False, 47 ) -> None: 48 """ 49 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 50 header 51 52 :param conn: the connection to the database 53 :param input: the input file 54 :param output: the output file 55 :param config: a dictionary containing the configuration of the model 56 :param param: a dictionary containing the parameters of the model 57 """ 58 59 # Init variables 60 self.init_variables() 61 62 # Input 63 self.set_input(input) 64 65 # Config 66 self.set_config(config) 67 68 # Param 69 self.set_param(param) 70 71 # Output 72 self.set_output(output) 73 74 # connexion 75 self.set_connexion(conn) 76 77 # Header 78 self.set_header() 79 80 # Samples 81 self.set_samples() 82 83 # Load data 84 if load: 85 self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
87 def set_samples(self, samples: list = None) -> list: 88 """ 89 The function `set_samples` sets the samples attribute of an object to a provided list or 90 retrieves it from a parameter dictionary. 91 92 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 93 input and sets the `samples` attribute of the class to the provided list. If no samples are 94 provided, it tries to get the samples from the class's parameters using the `get_param` method 95 :type samples: list 96 :return: The `samples` list is being returned. 97 """ 98 99 if not samples: 100 samples = self.get_param().get("samples", {}).get("list", None) 101 102 self.samples = samples 103 104 return samples
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: The `set_samples` method is a method of a class that takes a list of samples as input and sets the `samples` attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the `get_param` method.
Returns
The `samples` list is being returned.
106 def get_samples(self) -> list: 107 """ 108 This function returns a list of samples. 109 :return: The `get_samples` method is returning the `samples` attribute of the object. 110 """ 111 112 return self.samples
This function returns a list of samples.
Returns
The `get_samples` method is returning the `samples` attribute of the object.
114 def get_samples_check(self) -> bool: 115 """ 116 This function returns the value of the "check" key within the "samples" dictionary retrieved 117 from the parameters. 118 :return: The method `get_samples_check` is returning the value of the key "check" inside the 119 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 120 method. If the key "check" is not found, it will return `False`. 121 """ 122 123 return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method `get_samples_check` is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it will return `True` (the code's default is `True`, not `False` as previously stated).
125 def set_input(self, input: str = None) -> None: 126 """ 127 The function `set_input` takes a file name as input, extracts the name and extension, and sets 128 attributes in the class accordingly. 129 130 :param input: The `set_input` method in the provided code snippet is used to set attributes 131 related to the input file. Here's a breakdown of the parameters and their usage in the method: 132 :type input: str 133 """ 134 135 if input and not isinstance(input, str): 136 try: 137 self.input = input.name 138 except: 139 log.error(f"Input file '{input} in bad format") 140 raise ValueError(f"Input file '{input} in bad format") 141 else: 142 self.input = input 143 144 # Input format 145 if input: 146 input_name, input_extension = os.path.splitext(self.input) 147 self.input_name = input_name 148 self.input_extension = input_extension 149 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The `set_input` method in the provided code snippet is used to set attributes related to the input file. Here's a breakdown of the parameters and their usage in the method:
151 def set_config(self, config: dict) -> None: 152 """ 153 The set_config function takes a config object and assigns it as the configuration object for the 154 class. 155 156 :param config: The `config` parameter in the `set_config` function is a dictionary object that 157 contains configuration settings for the class. When you call the `set_config` function with a 158 dictionary object as the argument, it will set that dictionary as the configuration object for 159 the class 160 :type config: dict 161 """ 162 163 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The `config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call the `set_config` function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class.
165 def set_param(self, param: dict) -> None: 166 """ 167 This function sets a parameter object for the class based on the input dictionary. 168 169 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 170 as the `param` attribute of the class instance 171 :type param: dict 172 """ 173 174 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The `set_param` method you provided takes a dictionary object as input and sets it as the `param` attribute of the class instance.
176 def init_variables(self) -> None: 177 """ 178 This function initializes the variables that will be used in the rest of the class 179 """ 180 181 self.prefix = "howard" 182 self.table_variants = "variants" 183 self.dataframe = None 184 185 self.comparison_map = { 186 "gt": ">", 187 "gte": ">=", 188 "lt": "<", 189 "lte": "<=", 190 "equals": "=", 191 "contains": "SIMILAR TO", 192 } 193 194 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 195 196 self.code_type_map_to_sql = { 197 "Integer": "INTEGER", 198 "String": "VARCHAR", 199 "Float": "FLOAT", 200 "Flag": "VARCHAR", 201 } 202 203 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
205 def get_indexing(self) -> bool: 206 """ 207 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 208 returns False. 209 :return: The value of the indexing parameter. 210 """ 211 212 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
214 def get_connexion_config(self) -> dict: 215 """ 216 The function `get_connexion_config` returns a dictionary containing the configuration for a 217 connection, including the number of threads and memory limit. 218 :return: a dictionary containing the configuration for the Connexion library. 219 """ 220 221 # config 222 config = self.get_config() 223 224 # Connexion config 225 connexion_config = {} 226 threads = self.get_threads() 227 228 # Threads 229 if threads: 230 connexion_config["threads"] = threads 231 232 # Memory 233 # if config.get("memory", None): 234 # connexion_config["memory_limit"] = config.get("memory") 235 if self.get_memory(): 236 connexion_config["memory_limit"] = self.get_memory() 237 238 # Temporary directory 239 if config.get("tmp", None): 240 connexion_config["temp_directory"] = config.get("tmp") 241 242 # Access 243 if config.get("access", None): 244 access = config.get("access") 245 if access in ["RO"]: 246 access = "READ_ONLY" 247 elif access in ["RW"]: 248 access = "READ_WRITE" 249 connexion_db = self.get_connexion_db() 250 if connexion_db in ":memory:": 251 access = "READ_WRITE" 252 connexion_config["access_mode"] = access 253 254 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the database connection (threads, memory limit, temporary directory, access mode).
256 def get_duckdb_settings(self) -> dict: 257 """ 258 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 259 string. 260 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 261 """ 262 263 # config 264 config = self.get_config() 265 266 # duckdb settings 267 duckdb_settings_dict = {} 268 if config.get("duckdb_settings", None): 269 duckdb_settings = config.get("duckdb_settings") 270 duckdb_settings = full_path(duckdb_settings) 271 # duckdb setting is a file 272 if os.path.exists(duckdb_settings): 273 with open(duckdb_settings) as json_file: 274 duckdb_settings_dict = yaml.safe_load(json_file) 275 # duckdb settings is a string 276 else: 277 duckdb_settings_dict = json.loads(duckdb_settings) 278 279 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object, `duckdb_settings_dict`.
281 def set_connexion_db(self) -> str: 282 """ 283 The function `set_connexion_db` returns the appropriate database connection string based on the 284 input format and connection type. 285 :return: the value of the variable `connexion_db`. 286 """ 287 288 # Default connexion db 289 default_connexion_db = ":memory:" 290 291 # Find connexion db 292 if self.get_input_format() in ["db", "duckdb"]: 293 connexion_db = self.get_input() 294 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 295 connexion_db = default_connexion_db 296 elif self.get_connexion_type() in ["tmpfile"]: 297 tmp_name = tempfile.mkdtemp( 298 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 299 ) 300 connexion_db = f"{tmp_name}/tmp.db" 301 elif self.get_connexion_type() != "": 302 connexion_db = self.get_connexion_type() 303 else: 304 connexion_db = default_connexion_db 305 306 # Set connexion db 307 self.connexion_db = connexion_db 308 309 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
311 def set_connexion(self, conn) -> None: 312 """ 313 The function `set_connexion` creates a connection to a database, with options for different 314 database formats and settings. 315 316 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 317 database. If a connection is not provided, a new connection to an in-memory database is created. 318 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 319 sqlite 320 """ 321 322 # Connexion db 323 connexion_db = self.set_connexion_db() 324 325 # Connexion config 326 connexion_config = self.get_connexion_config() 327 328 # Connexion format 329 connexion_format = self.get_config().get("connexion_format", "duckdb") 330 # Set connexion format 331 self.connexion_format = connexion_format 332 333 # Connexion 334 if not conn: 335 if connexion_format in ["duckdb"]: 336 conn = duckdb.connect(connexion_db, config=connexion_config) 337 # duckDB settings 338 duckdb_settings = self.get_duckdb_settings() 339 if duckdb_settings: 340 for setting in duckdb_settings: 341 setting_value = duckdb_settings.get(setting) 342 if isinstance(setting_value, str): 343 setting_value = f"'{setting_value}'" 344 conn.execute(f"PRAGMA {setting}={setting_value};") 345 elif connexion_format in ["sqlite"]: 346 conn = sqlite3.connect(connexion_db) 347 348 # Set connexion 349 self.conn = conn 350 351 # Log 352 log.debug(f"connexion_format: {connexion_format}") 353 log.debug(f"connexion_db: {connexion_db}") 354 log.debug(f"connexion config: {connexion_config}") 355 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function `set_connexion` creates a connection to a database, with options for different database formats and settings.
Parameters:
- conn: the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then sets up the connection based on the specified format (e.g. duckdb or sqlite).
357 def set_output(self, output: str = None) -> None: 358 """ 359 The `set_output` function in Python sets the output file based on the input or a specified key 360 in the config file, extracting the output name, extension, and format. 361 362 :param output: The `output` parameter in the `set_output` method is used to specify the name of 363 the output file. If the config file has an 'output' key, the method sets the output to the value 364 of that key. If no output is provided, it sets the output to `None` 365 :type output: str 366 """ 367 368 if output and not isinstance(output, str): 369 self.output = output.name 370 else: 371 self.output = output 372 373 # Output format 374 if self.output: 375 output_name, output_extension = os.path.splitext(self.output) 376 self.output_name = output_name 377 self.output_extension = output_extension 378 self.output_format = self.output_extension.replace(".", "") 379 else: 380 self.output_name = None 381 self.output_extension = None 382 self.output_format = None
The `set_output` function sets the output file based on the provided value or the 'output' key in the config file, extracting the output name, extension, and format.
Parameters:
- output: the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
384 def set_header(self) -> None: 385 """ 386 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 387 """ 388 389 input_file = self.get_input() 390 default_header_list = [ 391 "##fileformat=VCFv4.2", 392 "#CHROM POS ID REF ALT QUAL FILTER INFO", 393 ] 394 395 # Full path 396 input_file = full_path(input_file) 397 398 if input_file: 399 400 input_format = self.get_input_format() 401 input_compressed = self.get_input_compressed() 402 config = self.get_config() 403 header_list = default_header_list 404 if input_format in [ 405 "vcf", 406 "hdr", 407 "tsv", 408 "csv", 409 "psv", 410 "parquet", 411 "db", 412 "duckdb", 413 ]: 414 # header provided in param 415 if config.get("header_file", None): 416 with open(config.get("header_file"), "rt") as f: 417 header_list = self.read_vcf_header(f) 418 # within a vcf file format (header within input file itsself) 419 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 420 # within a compressed vcf file format (.vcf.gz) 421 if input_compressed: 422 with bgzf.open(input_file, "rt") as f: 423 header_list = self.read_vcf_header(f) 424 # within an uncompressed vcf file format (.vcf) 425 else: 426 with open(input_file, "rt") as f: 427 header_list = self.read_vcf_header(f) 428 # header provided in default external file .hdr 429 elif os.path.exists((input_file + ".hdr")): 430 with open(input_file + ".hdr", "rt") as f: 431 header_list = self.read_vcf_header(f) 432 else: 433 try: # Try to get header info fields and file columns 434 435 with tempfile.TemporaryDirectory() as tmpdir: 436 437 # Create database 438 db_for_header = Database(database=input_file) 439 440 # Get header columns for infos fields 441 db_header_from_columns = ( 442 db_for_header.get_header_from_columns() 443 ) 444 445 # Get real columns in the file 446 db_header_columns = db_for_header.get_columns() 447 448 # Write header file 449 header_file_tmp = os.path.join(tmpdir, "header") 450 f = open(header_file_tmp, "w") 451 
vcf.Writer(f, db_header_from_columns) 452 f.close() 453 454 # Replace #CHROM line with rel columns 455 header_list = db_for_header.read_header_file( 456 header_file=header_file_tmp 457 ) 458 header_list[-1] = "\t".join(db_header_columns) 459 460 except: 461 462 log.warning( 463 f"No header for file {input_file}. Set as default VCF header" 464 ) 465 header_list = default_header_list 466 467 else: # try for unknown format ? 468 469 log.error(f"Input file format '{input_format}' not available") 470 raise ValueError(f"Input file format '{input_format}' not available") 471 472 if not header_list: 473 header_list = default_header_list 474 475 # header as list 476 self.header_list = header_list 477 478 # header as VCF object 479 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 480 481 else: 482 483 self.header_list = None 484 self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
486 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 487 """ 488 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 489 DataFrame based on the connection format. 490 491 :param query: The `query` parameter in the `get_query_to_df` function is a string that 492 represents the SQL query you want to execute. This query will be used to fetch data from a 493 database and convert it into a pandas DataFrame 494 :type query: str 495 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 496 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 497 function will only fetch up to that number of rows from the database query result. If no limit 498 is specified, 499 :type limit: int 500 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 501 """ 502 503 # Connexion format 504 connexion_format = self.get_connexion_format() 505 506 # Limit in query 507 if limit: 508 pd.set_option("display.max_rows", limit) 509 if connexion_format in ["duckdb"]: 510 df = ( 511 self.conn.execute(query) 512 .fetch_record_batch(limit) 513 .read_next_batch() 514 .to_pandas() 515 ) 516 elif connexion_format in ["sqlite"]: 517 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 518 519 # Full query 520 else: 521 if connexion_format in ["duckdb"]: 522 df = self.conn.execute(query).df() 523 elif connexion_format in ["sqlite"]: 524 df = pd.read_sql_query(query, self.conn) 525 526 return df
The `get_query_to_df` function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.
Parameters:
- query: a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result.
Returns: a pandas DataFrame.
528 def get_overview(self) -> None: 529 """ 530 The function prints the input, output, config, and dataframe of the current object 531 """ 532 table_variants_from = self.get_table_variants(clause="from") 533 sql_columns = self.get_header_columns_as_sql() 534 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 535 df = self.get_query_to_df(sql_query_export) 536 log.info( 537 "Input: " 538 + str(self.get_input()) 539 + " [" 540 + str(str(self.get_input_format())) 541 + "]" 542 ) 543 log.info( 544 "Output: " 545 + str(self.get_output()) 546 + " [" 547 + str(str(self.get_output_format())) 548 + "]" 549 ) 550 log.info("Config: ") 551 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 552 "\n" 553 ): 554 log.info("\t" + str(d)) 555 log.info("Param: ") 556 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 557 "\n" 558 ): 559 log.info("\t" + str(d)) 560 log.info("Sample list: " + str(self.get_header_sample_list())) 561 log.info("Dataframe: ") 562 for d in str(df).split("\n"): 563 log.info("\t" + str(d)) 564 565 # garbage collector 566 del df 567 gc.collect() 568 569 return None
The function prints the input, output, config, and dataframe of the current object
571 def get_stats(self) -> dict: 572 """ 573 The `get_stats` function calculates and returns various statistics of the current object, 574 including information about the input file, variants, samples, header fields, quality, and 575 SNVs/InDels. 576 :return: a dictionary containing various statistics of the current object. The dictionary has 577 the following structure: 578 """ 579 580 # Log 581 log.info(f"Stats Calculation...") 582 583 # table varaints 584 table_variants_from = self.get_table_variants() 585 586 # stats dict 587 stats = {"Infos": {}} 588 589 ### File 590 input_file = self.get_input() 591 stats["Infos"]["Input file"] = input_file 592 593 # Header 594 header_infos = self.get_header().infos 595 header_formats = self.get_header().formats 596 header_infos_list = list(header_infos) 597 header_formats_list = list(header_formats) 598 599 ### Variants 600 601 stats["Variants"] = {} 602 603 # Variants by chr 604 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 605 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 606 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 607 by=["CHROM"], kind="quicksort" 608 ) 609 610 # Total number of variants 611 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 612 613 # Calculate percentage 614 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 615 lambda x: (x / nb_of_variants) 616 ) 617 618 stats["Variants"]["Number of variants by chromosome"] = ( 619 nb_of_variants_by_chrom.to_dict(orient="index") 620 ) 621 622 stats["Infos"]["Number of variants"] = int(nb_of_variants) 623 624 ### Samples 625 626 # Init 627 samples = {} 628 nb_of_samples = 0 629 630 # Check Samples 631 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 632 log.debug(f"Check samples...") 633 for sample in self.get_header_sample_list(): 634 sql_query_samples = f""" 635 SELECT 
'{sample}' as sample, 636 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 637 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 638 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 639 FROM {table_variants_from} 640 WHERE ( 641 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 642 AND 643 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 644 ) 645 GROUP BY genotype 646 """ 647 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 648 sample_genotype_count = sql_query_genotype_df["count"].sum() 649 if len(sql_query_genotype_df): 650 nb_of_samples += 1 651 samples[f"{sample} - {sample_genotype_count} variants"] = ( 652 sql_query_genotype_df.to_dict(orient="index") 653 ) 654 655 stats["Samples"] = samples 656 stats["Infos"]["Number of samples"] = nb_of_samples 657 658 # # 659 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 660 # stats["Infos"]["Number of samples"] = nb_of_samples 661 # elif nb_of_samples: 662 # stats["Infos"]["Number of samples"] = "not a VCF format" 663 664 ### INFO and FORMAT fields 665 header_types_df = {} 666 header_types_list = { 667 "List of INFO fields": header_infos, 668 "List of FORMAT fields": header_formats, 669 } 670 i = 0 671 for header_type in header_types_list: 672 673 header_type_infos = header_types_list.get(header_type) 674 header_infos_dict = {} 675 676 for info in header_type_infos: 677 678 i += 1 679 header_infos_dict[i] = {} 680 681 # ID 682 header_infos_dict[i]["id"] = info 683 684 # num 685 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 686 if header_type_infos[info].num in genotype_map.keys(): 687 header_infos_dict[i]["Number"] = genotype_map.get( 688 header_type_infos[info].num 689 ) 690 else: 691 header_infos_dict[i]["Number"] = header_type_infos[info].num 692 693 # type 694 if header_type_infos[info].type: 695 header_infos_dict[i]["Type"] = 
header_type_infos[info].type 696 else: 697 header_infos_dict[i]["Type"] = "." 698 699 # desc 700 if header_type_infos[info].desc != None: 701 header_infos_dict[i]["Description"] = header_type_infos[info].desc 702 else: 703 header_infos_dict[i]["Description"] = "" 704 705 if len(header_infos_dict): 706 header_types_df[header_type] = pd.DataFrame.from_dict( 707 header_infos_dict, orient="index" 708 ).to_dict(orient="index") 709 710 # Stats 711 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 712 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 713 stats["Header"] = header_types_df 714 715 ### QUAL 716 if "QUAL" in self.get_header_columns(): 717 sql_query_qual = f""" 718 SELECT 719 avg(CAST(QUAL AS INTEGER)) AS Average, 720 min(CAST(QUAL AS INTEGER)) AS Minimum, 721 max(CAST(QUAL AS INTEGER)) AS Maximum, 722 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 723 median(CAST(QUAL AS INTEGER)) AS Median, 724 variance(CAST(QUAL AS INTEGER)) AS Variance 725 FROM {table_variants_from} 726 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 727 """ 728 729 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 730 stats["Quality"] = {"Stats": qual} 731 732 ### SNV and InDel 733 734 sql_query_snv = f""" 735 736 SELECT Type, count FROM ( 737 738 SELECT 739 'Total' AS Type, 740 count(*) AS count 741 FROM {table_variants_from} 742 743 UNION 744 745 SELECT 746 'MNV' AS Type, 747 count(*) AS count 748 FROM {table_variants_from} 749 WHERE len(REF) > 1 AND len(ALT) > 1 750 AND len(REF) = len(ALT) 751 752 UNION 753 754 SELECT 755 'InDel' AS Type, 756 count(*) AS count 757 FROM {table_variants_from} 758 WHERE len(REF) > 1 OR len(ALT) > 1 759 AND len(REF) != len(ALT) 760 761 UNION 762 763 SELECT 764 'SNV' AS Type, 765 count(*) AS count 766 FROM {table_variants_from} 767 WHERE len(REF) = 1 AND len(ALT) = 1 768 769 ) 770 771 ORDER BY count DESC 772 773 """ 774 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 775 776 
sql_query_snv_substitution = f""" 777 SELECT 778 concat(REF, '>', ALT) AS 'Substitution', 779 count(*) AS count 780 FROM {table_variants_from} 781 WHERE len(REF) = 1 AND len(ALT) = 1 782 GROUP BY REF, ALT 783 ORDER BY count(*) DESC 784 """ 785 snv_substitution = ( 786 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 787 ) 788 stats["Variants"]["Counts"] = snv_indel 789 stats["Variants"]["Substitutions"] = snv_substitution 790 791 return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
793 def stats_to_file(self, file: str = None) -> str: 794 """ 795 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 796 into a JSON object, and writes the JSON object to the specified file. 797 798 :param file: The `file` parameter is a string that represents the file path where the JSON data 799 will be written 800 :type file: str 801 :return: the name of the file that was written to. 802 """ 803 804 # Get stats 805 stats = self.get_stats() 806 807 # Serializing json 808 json_object = json.dumps(stats, indent=4) 809 810 # Writing to sample.json 811 with open(file, "w") as outfile: 812 outfile.write(json_object) 813 814 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters:
- file: a string that represents the file path where the JSON data will be written.
Returns: the name of the file that was written to.
816 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 817 """ 818 The `print_stats` function generates a markdown file and prints the statistics contained in a 819 JSON file in a formatted manner. 820 821 :param output_file: The `output_file` parameter is a string that specifies the path and filename 822 of the output file where the stats will be printed in Markdown format. If no `output_file` is 823 provided, a temporary directory will be created and the stats will be saved in a file named 824 "stats.md" within that 825 :type output_file: str 826 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 827 file where the statistics will be saved. If no value is provided, a temporary directory will be 828 created and a default file name "stats.json" will be used 829 :type json_file: str 830 :return: The function `print_stats` does not return any value. It has a return type annotation 831 of `None`. 832 """ 833 834 # Full path 835 output_file = full_path(output_file) 836 json_file = full_path(json_file) 837 838 with tempfile.TemporaryDirectory() as tmpdir: 839 840 # Files 841 if not output_file: 842 output_file = os.path.join(tmpdir, "stats.md") 843 if not json_file: 844 json_file = os.path.join(tmpdir, "stats.json") 845 846 # Create folders 847 if not os.path.exists(os.path.dirname(output_file)): 848 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 849 if not os.path.exists(os.path.dirname(json_file)): 850 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 851 852 # Create stats JSON file 853 stats_file = self.stats_to_file(file=json_file) 854 855 # Print stats file 856 with open(stats_file) as f: 857 stats = yaml.safe_load(f) 858 859 # Output 860 output_title = [] 861 output_index = [] 862 output = [] 863 864 # Title 865 output_title.append("# HOWARD Stats") 866 867 # Index 868 output_index.append("## Index") 869 870 # Process sections 871 for section in stats: 
872 infos = stats.get(section) 873 section_link = "#" + section.lower().replace(" ", "-") 874 output.append(f"## {section}") 875 output_index.append(f"- [{section}]({section_link})") 876 877 if len(infos): 878 for info in infos: 879 try: 880 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 881 is_df = True 882 except: 883 try: 884 df = pd.DataFrame.from_dict( 885 json.loads((infos.get(info))), orient="index" 886 ) 887 is_df = True 888 except: 889 is_df = False 890 if is_df: 891 output.append(f"### {info}") 892 info_link = "#" + info.lower().replace(" ", "-") 893 output_index.append(f" - [{info}]({info_link})") 894 output.append(f"{df.to_markdown(index=False)}") 895 else: 896 output.append(f"- {info}: {infos.get(info)}") 897 else: 898 output.append(f"NA") 899 900 # Write stats in markdown file 901 with open(output_file, "w") as fp: 902 for item in output_title: 903 fp.write("%s\n" % item) 904 for item in output_index: 905 fp.write("%s\n" % item) 906 for item in output: 907 fp.write("%s\n" % item) 908 909 # Output stats in markdown 910 print("") 911 print("\n\n".join(output_title)) 912 print("") 913 print("\n\n".join(output)) 914 print("") 915 916 return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters:
- output_file: a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within it.
- json_file: a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns: the function `print_stats` does not return any value. It has a return type annotation of `None`.
    def get_input(self) -> str:
        """
        Return the input file path stored on the object.

        :return: the `input` attribute.
        """
        return self.input
It returns the value of the input variable.
Returns
The input is being returned.
925 def get_input_format(self, input_file: str = None) -> str: 926 """ 927 This function returns the format of the input variable, either from the provided input file or 928 by prompting for input. 929 930 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 931 represents the file path of the input file. If no `input_file` is provided when calling the 932 method, it will default to `None` 933 :type input_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not input_file: 938 input_file = self.get_input() 939 input_format = get_file_format(input_file) 940 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters:
- input_file: a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it defaults to `None` and the object's own input file is used.
Returns: the format of the input file.
942 def get_input_compressed(self, input_file: str = None) -> str: 943 """ 944 The function `get_input_compressed` returns the format of the input variable after compressing 945 it. 946 947 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 948 that represents the file path of the input file. If no `input_file` is provided when calling the 949 method, it will default to `None` and the method will then call `self.get_input()` to 950 :type input_file: str 951 :return: The function `get_input_compressed` returns the compressed format of the input 952 variable. 953 """ 954 955 if not input_file: 956 input_file = self.get_input() 957 input_compressed = get_file_compressed(input_file) 958 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters:
- input_file: a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it defaults to `None` and the method calls `self.get_input()` to use the object's own input file.
Returns: the compression status of the input file.
    def get_output(self) -> str:
        """
        Return the output file path stored on the object.

        :return: the `output` attribute.
        """

        return self.output
It returns the output file path stored on the object.
Returns: the `output` attribute.
968 def get_output_format(self, output_file: str = None) -> str: 969 """ 970 The function `get_output_format` returns the format of the input variable or the output file if 971 provided. 972 973 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 974 that represents the file path of the output file. If no `output_file` is provided when calling 975 the method, it will default to the output obtained from the `get_output` method of the class 976 instance. The 977 :type output_file: str 978 :return: The format of the input variable is being returned. 979 """ 980 981 if not output_file: 982 output_file = self.get_output() 983 output_format = get_file_format(output_file) 984 985 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters:
- output_file: a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it defaults to the output obtained from the `get_output` method of the class instance.
Returns: the format of the output file.
    def get_config(self) -> dict:
        """
        Return the configuration dictionary of the object.

        :return: the `config` attribute.
        """
        return self.config
It returns the config
Returns
The config variable is being returned.
    def get_param(self) -> dict:
        """
        Return the parameters dictionary of the object.

        :return: the `param` attribute.
        """
        return self.param
It returns the param
Returns
The param variable is being returned.
    def get_connexion_db(self) -> str:
        """
        Return the database connection string (e.g. ":memory:" or a file
        path).

        :return: the `connexion_db` attribute.
        """
        return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
    def get_prefix(self) -> str:
        """
        Return the prefix of the object (used e.g. for temporary files).

        :return: the `prefix` attribute.
        """
        return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1015 def get_table_variants(self, clause: str = "select") -> str: 1016 """ 1017 This function returns the table_variants attribute of the object 1018 1019 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1020 defaults to select (optional) 1021 :return: The table_variants attribute of the object. 1022 """ 1023 1024 # Access 1025 access = self.get_config().get("access", None) 1026 1027 # Clauses "select", "where", "update" 1028 if clause in ["select", "where", "update"]: 1029 table_variants = self.table_variants 1030 # Clause "from" 1031 elif clause in ["from"]: 1032 # For Read Only 1033 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1034 input_file = self.get_input() 1035 table_variants = f"'{input_file}' as variants" 1036 # For Read Write 1037 else: 1038 table_variants = f"{self.table_variants} as variants" 1039 else: 1040 table_variants = self.table_variants 1041 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause in which the table will be used. Either "select" or "from" (optional), defaults to "select".
Returns
The table_variants attribute of the object.
    def get_tmp_dir(self) -> str:
        """
        Return the temporary directory path based on configuration and
        parameters, with "/tmp" as default.

        :return: the temporary directory path resolved by `get_tmp`.
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns: the temporary directory path resolved from the configuration and parameters, with a default value of "/tmp".
    def get_connexion_type(self) -> str:
        """
        Return the configured connexion type ("memory" by default).

        :return: the "connexion_type" value from the config.
        """
        return self.get_config().get("connexion_type", "memory")
Returns the configured connexion type from the config, defaulting to "memory".
    def get_connexion(self):
        """
        Return the database connection object.

        :return: the `conn` attribute.
        """
        return self.conn
It returns the connection object
Returns
The connection object.
    def close_connexion(self) -> None:
        """
        Close the connection to the database.

        :return: the result of `conn.close()` (None).
        """
        return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1078 def get_header(self, type: str = "vcf"): 1079 """ 1080 This function returns the header of the VCF file as a list of strings 1081 1082 :param type: the type of header you want to get, defaults to vcf (optional) 1083 :return: The header of the vcf file. 1084 """ 1085 1086 if self.header_vcf: 1087 if type == "vcf": 1088 return self.header_vcf 1089 elif type == "list": 1090 return self.header_list 1091 else: 1092 if type == "vcf": 1093 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1094 return header 1095 elif type == "list": 1096 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1098 def get_header_infos_list(self) -> list: 1099 """ 1100 This function retrieves a list of information fields from the header. 1101 :return: A list of information fields from the header. 1102 """ 1103 1104 # Init 1105 infos_list = [] 1106 1107 for field in self.get_header().infos: 1108 infos_list.append(field) 1109 1110 return infos_list
This function retrieves a list of information fields from the header.
Returns
A list of information fields from the header.
1112 def get_header_length(self, file: str = None) -> int: 1113 """ 1114 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1115 line. 1116 1117 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1118 header file. If this argument is provided, the function will read the header from the specified 1119 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1120 :type file: str 1121 :return: the length of the header list, excluding the #CHROM line. 1122 """ 1123 1124 if file: 1125 return len(self.read_vcf_header_file(file=file)) - 1 1126 elif self.get_header(type="list"): 1127 return len(self.get_header(type="list")) - 1 1128 else: 1129 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters:
- file: an optional argument that specifies the path to a VCF header file. If provided, the function reads the header from the specified file and returns the length of the header list minus 1 (to exclude the #CHROM line).
Returns: the length of the header list, excluding the #CHROM line.
1131 def get_header_columns(self) -> str: 1132 """ 1133 This function returns the header list of a VCF 1134 1135 :return: The length of the header list. 1136 """ 1137 if self.get_header(): 1138 return self.get_header(type="list")[-1] 1139 else: 1140 return ""
This function returns the header list of a VCF
Returns: the #CHROM columns line of the header as a tab-separated string.
1142 def get_header_columns_as_list(self) -> list: 1143 """ 1144 This function returns the header list of a VCF 1145 1146 :return: The length of the header list. 1147 """ 1148 if self.get_header(): 1149 return self.get_header_columns().strip().split("\t") 1150 else: 1151 return []
This function returns the header list of a VCF
Returns: the header columns as a list of column names.
1153 def get_header_columns_as_sql(self) -> str: 1154 """ 1155 This function retruns header length (without #CHROM line) 1156 1157 :return: The length of the header list. 1158 """ 1159 sql_column_list = [] 1160 for col in self.get_header_columns_as_list(): 1161 sql_column_list.append(f'"{col}"') 1162 return ",".join(sql_column_list)
This function returns the header columns as a comma-separated list of quoted SQL identifiers.
Returns: the columns formatted for use in a SQL SELECT clause.
1164 def get_header_sample_list( 1165 self, check: bool = False, samples: list = None, samples_force: bool = False 1166 ) -> list: 1167 """ 1168 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1169 checking and filtering based on input parameters. 1170 1171 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1172 parameter that determines whether to check if the samples in the list are properly defined as 1173 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1174 list is defined as a, defaults to False 1175 :type check: bool (optional) 1176 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1177 allows you to specify a subset of samples from the header. If you provide a list of sample 1178 names, the function will check if each sample is defined in the header. If a sample is not found 1179 in the 1180 :type samples: list 1181 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1182 a boolean parameter that determines whether to force the function to return the sample list 1183 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1184 function will return the sample list without performing, defaults to False 1185 :type samples_force: bool (optional) 1186 :return: The function `get_header_sample_list` returns a list of samples based on the input 1187 parameters and conditions specified in the function. 
1188 """ 1189 1190 # Init 1191 samples_list = [] 1192 1193 if samples is None: 1194 samples_list = self.header_vcf.samples 1195 else: 1196 samples_checked = [] 1197 for sample in samples: 1198 if sample in self.header_vcf.samples: 1199 samples_checked.append(sample) 1200 else: 1201 log.warning(f"Sample '{sample}' not defined in header") 1202 samples_list = samples_checked 1203 1204 # Force sample list without checking if is_genotype_column 1205 if samples_force: 1206 log.warning(f"Samples {samples_list} not checked if genotypes") 1207 return samples_list 1208 1209 if check: 1210 samples_checked = [] 1211 for sample in samples_list: 1212 if self.is_genotype_column(column=sample): 1213 samples_checked.append(sample) 1214 else: 1215 log.warning( 1216 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1217 ) 1218 samples_list = samples_checked 1219 1220 # Return samples list 1221 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- check: The `check` parameter is a boolean that determines whether to verify that each sample in the list is properly defined as a genotype column. If `check` is set to `True`, samples that are not valid genotype columns are dropped with a warning; defaults to False.
- samples: The `samples` parameter is a list that allows you to specify a subset of samples from the header. If provided, each sample name is checked against the header; names not found in the header are dropped with a warning.
- samples_force: The `samples_force` parameter is a boolean that forces the function to return the sample list without checking whether the samples are genotype columns. If `samples_force` is set to `True`, the genotype check is skipped; defaults to False.
Returns
The function `get_header_sample_list` returns a list of samples based on the input parameters and conditions specified in the function.
1223 def is_genotype_column(self, column: str = None) -> bool: 1224 """ 1225 This function checks if a given column is a genotype column in a database. 1226 1227 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1228 represents the column name in a database table. This method checks if the specified column is a 1229 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1230 method of 1231 :type column: str 1232 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1233 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1234 column name and returns the result. If the `column` parameter is None, it returns False. 1235 """ 1236 1237 if column is not None: 1238 return Database(database=self.get_input()).is_genotype_column(column=column) 1239 else: 1240 return False
This function checks if a given column is a genotype column in a database.
Parameters
- column: The `column` parameter is a string naming a column in a database table. If a column name is provided, the method delegates to the `is_genotype_column` method of the `Database` class.
Returns
The `is_genotype_column` method returns a boolean. If the `column` parameter is not None, it returns the result of the `Database` class check for that column; otherwise it returns False.
1242 def get_verbose(self) -> bool: 1243 """ 1244 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1245 exist 1246 1247 :return: The value of the key "verbose" in the config dictionary. 1248 """ 1249 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1251 def get_connexion_format(self) -> str: 1252 """ 1253 It returns the connexion format of the object. 1254 :return: The connexion_format is being returned. 1255 """ 1256 connexion_format = self.connexion_format 1257 if connexion_format not in ["duckdb", "sqlite"]: 1258 log.error(f"Unknown connexion format {connexion_format}") 1259 raise ValueError(f"Unknown connexion format {connexion_format}") 1260 else: 1261 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
1263 def insert_file_to_table( 1264 self, 1265 file, 1266 columns: str, 1267 header_len: int = 0, 1268 sep: str = "\t", 1269 chunksize: int = 1000000, 1270 ) -> None: 1271 """ 1272 The function reads a file in chunks and inserts each chunk into a table based on the specified 1273 database format. 1274 1275 :param file: The `file` parameter is the file that you want to load into a table. It should be 1276 the path to the file on your system 1277 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1278 should contain the names of the columns in the table where the data will be inserted. The column 1279 names should be separated by commas within the string. For example, if you have columns named 1280 "id", "name 1281 :type columns: str 1282 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1283 the number of lines to skip at the beginning of the file before reading the actual data. This 1284 parameter allows you to skip any header information present in the file before processing the 1285 data, defaults to 0 1286 :type header_len: int (optional) 1287 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1288 separator character that is used in the file being read. In this case, the default separator is 1289 set to `\t`, which represents a tab character. You can change this parameter to a different 1290 separator character if, defaults to \t 1291 :type sep: str (optional) 1292 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1293 when processing the file in chunks. In the provided code snippet, the default value for 1294 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1295 to 1000000 1296 :type chunksize: int (optional) 1297 """ 1298 1299 # Config 1300 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1301 connexion_format = self.get_connexion_format() 1302 1303 log.debug("chunksize: " + str(chunksize)) 1304 1305 if chunksize: 1306 for chunk in pd.read_csv( 1307 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1308 ): 1309 if connexion_format in ["duckdb"]: 1310 sql_insert_into = ( 1311 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1312 ) 1313 self.conn.execute(sql_insert_into) 1314 elif connexion_format in ["sqlite"]: 1315 chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The `file` parameter is the file to load into the table; it should be the path to the file on your system.
- columns: The `columns` parameter is a string containing the comma-separated names of the table columns into which data will be inserted.
- header_len: The `header_len` parameter specifies the number of lines to skip at the beginning of the file before reading the actual data; defaults to 0.
- sep: The `sep` parameter specifies the field separator used in the file; the default is a tab character.
- chunksize: The `chunksize` parameter specifies the number of rows to read at a time when processing the file in chunks; defaults to 1000000.
1317 def load_data( 1318 self, 1319 input_file: str = None, 1320 drop_variants_table: bool = False, 1321 sample_size: int = 20480, 1322 ) -> None: 1323 """ 1324 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1325 table before loading the data and specify a sample size. 1326 1327 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1328 table 1329 :type input_file: str 1330 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1331 determines whether the variants table should be dropped before loading the data. If set to 1332 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1333 not be dropped, defaults to False 1334 :type drop_variants_table: bool (optional) 1335 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1336 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1337 20480 1338 :type sample_size: int (optional) 1339 """ 1340 1341 log.info("Loading...") 1342 1343 # change input file 1344 if input_file: 1345 self.set_input(input_file) 1346 self.set_header() 1347 1348 # drop variants table 1349 if drop_variants_table: 1350 self.drop_variants_table() 1351 1352 # get table variants 1353 table_variants = self.get_table_variants() 1354 1355 # Access 1356 access = self.get_config().get("access", None) 1357 log.debug(f"access: {access}") 1358 1359 # Input format and compress 1360 input_format = self.get_input_format() 1361 input_compressed = self.get_input_compressed() 1362 log.debug(f"input_format: {input_format}") 1363 log.debug(f"input_compressed: {input_compressed}") 1364 1365 # input_compressed_format 1366 if input_compressed: 1367 input_compressed_format = "gzip" 1368 else: 1369 input_compressed_format = "none" 1370 log.debug(f"input_compressed_format: {input_compressed_format}") 1371 1372 # Connexion 
format 1373 connexion_format = self.get_connexion_format() 1374 1375 # Sample size 1376 if not sample_size: 1377 sample_size = -1 1378 log.debug(f"sample_size: {sample_size}") 1379 1380 # Load data 1381 log.debug(f"Load Data from {input_format}") 1382 1383 # DuckDB connexion 1384 if connexion_format in ["duckdb"]: 1385 1386 # Database already exists 1387 if self.input_format in ["db", "duckdb"]: 1388 1389 if connexion_format in ["duckdb"]: 1390 log.debug(f"Input file format '{self.input_format}' duckDB") 1391 else: 1392 log.error( 1393 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1394 ) 1395 raise ValueError( 1396 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1397 ) 1398 1399 # Load from existing database format 1400 else: 1401 1402 try: 1403 # Create Table or View 1404 database = Database(database=self.input) 1405 sql_from = database.get_sql_from(sample_size=sample_size) 1406 1407 if access in ["RO"]: 1408 sql_load = ( 1409 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1410 ) 1411 else: 1412 sql_load = ( 1413 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1414 ) 1415 self.conn.execute(sql_load) 1416 1417 except: 1418 # Format not available 1419 log.error(f"Input file format '{self.input_format}' not available") 1420 raise ValueError( 1421 f"Input file format '{self.input_format}' not available" 1422 ) 1423 1424 # SQLite connexion 1425 elif connexion_format in ["sqlite"] and input_format in [ 1426 "vcf", 1427 "tsv", 1428 "csv", 1429 "psv", 1430 ]: 1431 1432 # Main structure 1433 structure = { 1434 "#CHROM": "VARCHAR", 1435 "POS": "INTEGER", 1436 "ID": "VARCHAR", 1437 "REF": "VARCHAR", 1438 "ALT": "VARCHAR", 1439 "QUAL": "VARCHAR", 1440 "FILTER": "VARCHAR", 1441 "INFO": "VARCHAR", 1442 } 1443 1444 # Strcuture with samples 1445 structure_complete = structure 1446 if self.get_header_sample_list(): 1447 structure["FORMAT"] = "VARCHAR" 
1448 for sample in self.get_header_sample_list(): 1449 structure_complete[sample] = "VARCHAR" 1450 1451 # Columns list for create and insert 1452 sql_create_table_columns = [] 1453 sql_create_table_columns_list = [] 1454 for column in structure_complete: 1455 column_type = structure_complete[column] 1456 sql_create_table_columns.append( 1457 f'"{column}" {column_type} default NULL' 1458 ) 1459 sql_create_table_columns_list.append(f'"{column}"') 1460 1461 # Create database 1462 log.debug(f"Create Table {table_variants}") 1463 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1464 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1465 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1466 self.conn.execute(sql_create_table) 1467 1468 # chunksize define length of file chunk load file 1469 chunksize = 100000 1470 1471 # delimiter 1472 delimiter = file_format_delimiters.get(input_format, "\t") 1473 1474 # Load the input file 1475 with open(self.input, "rt") as input_file: 1476 1477 # Use the appropriate file handler based on the input format 1478 if input_compressed: 1479 input_file = bgzf.open(self.input, "rt") 1480 if input_format in ["vcf"]: 1481 header_len = self.get_header_length() 1482 else: 1483 header_len = 0 1484 1485 # Insert the file contents into a table 1486 self.insert_file_to_table( 1487 input_file, 1488 columns=sql_create_table_columns_list_sql, 1489 header_len=header_len, 1490 sep=delimiter, 1491 chunksize=chunksize, 1492 ) 1493 1494 else: 1495 log.error( 1496 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1497 ) 1498 raise ValueError( 1499 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1500 ) 1501 1502 # Explode INFOS fields into table fields 1503 if self.get_explode_infos(): 1504 self.explode_infos( 1505 prefix=self.get_explode_infos_prefix(), 1506 fields=self.get_explode_infos_fields(), 
1507 force=True, 1508 ) 1509 1510 # Create index after insertion 1511 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The `drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data; defaults to False.
- sample_size: The `sample_size` parameter determines the number of rows to be sampled from the input file; if it is falsy (e.g. `None`), -1 (no limit) is used instead; defaults to 20480.
1513 def get_explode_infos(self) -> bool: 1514 """ 1515 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1516 to False if it is not set. 1517 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1518 value. If the parameter is not present, it will return False. 1519 """ 1520 1521 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Return the list of INFO fields to explode, resolved from the
        `explode_infos_fields` argument or from the
        "explode.explode_infos_fields" parameter.

        Fields may be given as a comma-separated string or a list. Each
        entry is treated as a regex pattern matched against the header INFO
        fields; the keyword "*" (the default when nothing is configured)
        expands to all header INFO fields.

        :param explode_infos_fields: Fields to explode, as a
            comma-separated string or a list; entries are regex patterns
            and "*" selects all header fields
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: If True, drop fields that are
            not present in the header; if False, keep them as-is, defaults
            to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The resolved list of INFO field names, with patterns
            expanded and duplicates removed.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list: accept either a string or a list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (deduplicated and sorted for stable output)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact match takes precedence over pattern expansion;
                # otherwise, already-listed input fields are excluded from
                # the expanded set to avoid duplicates
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter specifies the fields to explode, as a comma-separated string or a list; each entry is treated as a regex pattern and the keyword `*` (the default when nothing is configured) matches all fields in the header.
- remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean flag that determines whether to drop fields that are not present in the header; defaults to False.
Returns
The function `get_explode_infos_fields` returns the resolved list of INFO field names, with patterns expanded against the header and duplicates removed.
1623 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1624 """ 1625 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1626 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1627 not provided. 1628 1629 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1630 prefix to be used for exploding or expanding information 1631 :type explode_infos_prefix: str 1632 :return: the value of the variable `explode_infos_prefix`. 1633 """ 1634 1635 if not explode_infos_prefix: 1636 explode_infos_prefix = ( 1637 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1638 ) 1639 1640 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a prefix to be used for exploded INFO fields.
Returns
The resolved value of `explode_infos_prefix`, falling back to the "explode.explode_infos_prefix" parameter (default empty string) when not provided.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table, optionally dropping and re-creating it if
        it already exists.

        :param table_name: The name of the table to which to add a column
        :param column_name: The name of the column to add (compared
            case-insensitively against existing columns)
        :param column_type: The SQL data type of the new column, e.g.
            "INTEGER", "TEXT", "REAL"
        :param default_value: Optional default value assigned to the new
            column
        :param drop: If True and the column already exists, drop it before
            adding the new column; if False, an existing column is left
            untouched, defaults to False
        :type drop: bool (optional)
        :return: A dict describing the added column ("table_name",
            "column_name", "column_type", "default_value"), or None when
            the column already existed (left as-is, or dropped and
            re-created — 'added' is set to not 'dropped' below).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (LIMIT 0 fetches the schema only, no rows)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                # Column present and no drop requested: nothing to do
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A re-created (dropped) column is not reported as newly added
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The `column_type` parameter specifies the SQL data type of the column to add, such as "INTEGER", "TEXT", or "REAL".
- default_value: The `default_value` parameter optionally specifies the default value assigned to the newly added column.
- drop: The `drop` parameter is a boolean flag that determines whether to drop and re-create the column if it already exists in the table; defaults to False.
Returns
A dict describing the added column (table_name, column_name, column_type, default_value), or None when the column was not newly added.
1714 def drop_column( 1715 self, column: dict = None, table_name: str = None, column_name: str = None 1716 ) -> bool: 1717 """ 1718 The `drop_column` function drops a specified column from a given table in a database and returns 1719 True if the column was successfully dropped, and False if the column does not exist in the 1720 table. 1721 1722 :param column: The `column` parameter is a dictionary that contains information about the column 1723 you want to drop. It has two keys: 1724 :type column: dict 1725 :param table_name: The `table_name` parameter is the name of the table from which you want to 1726 drop a column 1727 :type table_name: str 1728 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1729 from the table 1730 :type column_name: str 1731 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1732 and False if the column does not exist in the table. 1733 """ 1734 1735 # Find column infos 1736 if column: 1737 if isinstance(column, dict): 1738 table_name = column.get("table_name", None) 1739 column_name = column.get("column_name", None) 1740 elif isinstance(column, str): 1741 table_name = self.get_table_variants() 1742 column_name = column 1743 else: 1744 table_name = None 1745 column_name = None 1746 1747 if not table_name and not column_name: 1748 return False 1749 1750 # Removed 1751 removed = False 1752 1753 # Check if the column already exists in the table 1754 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1755 columns = self.get_query_to_df(query).columns.tolist() 1756 if column_name in columns: 1757 log.debug(f"The {column_name} column exists in the {table_name} table") 1758 else: 1759 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1760 return False 1761 1762 # Add column in table # ALTER TABLE integers DROP k 1763 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1764 
self.execute_query(add_column_query) 1765 removed = True 1766 log.debug( 1767 f"The {column_name} column was successfully dropped to the {table_name} table" 1768 ) 1769 1770 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The `column` parameter is either a dict with "table_name" and "column_name" keys, or a column name string (the variants table is then used).
- table_name: The `table_name` parameter is the name of the table from which you want to drop a column.
- column_name: The `column_name` parameter is the name of the column that you want to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode INFO fields of the variants table into individual columns.

        For each selected INFO field, a column named `prefix + field` is
        added to the table and populated by parsing the raw INFO string
        (via REGEXP_EXTRACT on duckdb, instr/substr on sqlite). Updates are
        run per chromosome to limit the size of each UPDATE.

        :param prefix: Prefix for the exploded INFO columns; when not a
            usable string, the configured prefix or "INFO/" is used
        :type prefix: str
        :param create_index: If True, (re)create indexes after exploding,
            defaults to False
        :type create_index: bool (optional)
        :param fields: INFO fields (or patterns) to explode; resolved via
            `get_explode_infos_fields`
        :type fields: list
        :param force: If True, drop and re-create columns that already
            exist, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: If True, all fields are
            updated in a single UPDATE statement; otherwise one UPDATE per
            field, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: Name of the target table; when None, the variants
            table is used
        :type table: str
        :return: The list of added columns (dicts from `add_column`).
        """

        # drop indexes before altering the table
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access (read-only connections cannot be altered)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to configured prefix, then "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: absence is not an error)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Column name for the exploded field
                info_id_sql = prefix + info

                # Only explode fields known from header, request or extras
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/cardinality from header, String otherwise
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL; multi-valued fields stay VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field column (dropped and re-created when forced)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: backend-specific extraction
                        # of 'info=value' from the raw INFO string
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (fallback: one pass over the whole table)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful with several chromosomes)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # One UPDATE for all fields at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO fields. If the `prefix` is not provided or is set to `None`, the function will use the value of `self.get_explode_infos_prefix()` as the prefix.
- create_index: The `create_index` parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to `False`, they will not. Defaults to `False`.
- fields: The `fields` parameter is a list of INFO fields to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded.
- force: The `force` parameter is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If `force` is set to `True`, the column will be dropped and recreated. Defaults to `False`.
- proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean flag that determines whether to process all the INFO fields together (`True`) or individually (`False`). Defaults to `False`.
- table: The `table` parameter is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the `table` parameter, the function will use that table name.
Returns
The `explode_infos` function returns a list of added columns.
1989 def create_indexes(self) -> None: 1990 """ 1991 Create indexes on the table after insertion 1992 """ 1993 1994 # Access 1995 access = self.get_config().get("access", None) 1996 1997 # get table variants 1998 table_variants = self.get_table_variants("FROM") 1999 2000 if self.get_indexing() and access not in ["RO"]: 2001 # Create index 2002 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2003 self.conn.execute(sql_create_table_index) 2004 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2005 self.conn.execute(sql_create_table_index) 2006 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2007 self.conn.execute(sql_create_table_index) 2008 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2009 self.conn.execute(sql_create_table_index) 2010 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2011 self.conn.execute(sql_create_table_index) 2012 for field in self.index_additionnal_fields: 2013 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2014 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
2016 def drop_indexes(self) -> None: 2017 """ 2018 Create indexes on the table after insertion 2019 """ 2020 2021 # Access 2022 access = self.get_config().get("access", None) 2023 2024 # get table variants 2025 table_variants = self.get_table_variants("FROM") 2026 2027 # Get database format 2028 connexion_format = self.get_connexion_format() 2029 2030 if access not in ["RO"]: 2031 if connexion_format in ["duckdb"]: 2032 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2033 elif connexion_format in ["sqlite"]: 2034 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2035 2036 list_indexes = self.conn.execute(sql_list_indexes) 2037 index_names = [row[0] for row in list_indexes.fetchall()] 2038 for index in index_names: 2039 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2040 self.conn.execute(sql_drop_table_index)
Drop indexes on the variants table.
2042 def read_vcf_header(self, f) -> list: 2043 """ 2044 It reads the header of a VCF file and returns a list of the header lines 2045 2046 :param f: the file object 2047 :return: The header lines of the VCF file. 2048 """ 2049 2050 header_list = [] 2051 for line in f: 2052 header_list.append(line) 2053 if line.startswith("#CHROM"): 2054 break 2055 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2057 def read_vcf_header_file(self, file: str = None) -> list: 2058 """ 2059 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2060 uncompressed files. 2061 2062 :param file: The `file` parameter is a string that represents the path to the VCF header file 2063 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2064 default to `None` 2065 :type file: str 2066 :return: The function `read_vcf_header_file` returns a list. 2067 """ 2068 2069 if self.get_input_compressed(input_file=file): 2070 with bgzf.open(file, "rt") as f: 2071 return self.read_vcf_header(f=f) 2072 else: 2073 with open(file, "rt") as f: 2074 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The `file` parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to `None`.
Returns
The function `read_vcf_header_file` returns a list.
2076 def execute_query(self, query: str): 2077 """ 2078 It takes a query as an argument, executes it, and returns the results 2079 2080 :param query: The query to be executed 2081 :return: The result of the query is being returned. 2082 """ 2083 if query: 2084 return self.conn.execute(query) # .fetchall() 2085 else: 2086 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
2088 def export_output( 2089 self, 2090 output_file: str | None = None, 2091 output_header: str | None = None, 2092 export_header: bool = True, 2093 query: str | None = None, 2094 parquet_partitions: list | None = None, 2095 chunk_size: int | None = None, 2096 threads: int | None = None, 2097 sort: bool = False, 2098 index: bool = False, 2099 order_by: str | None = None, 2100 fields_to_rename: dict | None = None 2101 ) -> bool: 2102 """ 2103 The `export_output` function exports data from a VCF file to various formats, including VCF, 2104 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2105 partitioning. 2106 2107 :param output_file: The `output_file` parameter is a string that specifies the name of the 2108 output file where the exported data will be saved 2109 :type output_file: str | None 2110 :param output_header: The `output_header` parameter is a string that specifies the name of the 2111 file where the header of the VCF file will be exported. If this parameter is not provided, the 2112 header will be exported to a file with the same name as the `output_file` parameter, but with 2113 the extension " 2114 :type output_header: str | None 2115 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2116 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2117 True, the header will be exported to a file. If `export_header` is False, the header will not 2118 be, defaults to True 2119 :type export_header: bool (optional) 2120 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2121 that can be used to filter and select specific data from the VCF file before exporting it. If 2122 provided, only the data that matches the query will be exported. 
This allows you to customize 2123 the exported data based on 2124 :type query: str | None 2125 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2126 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2127 organize data in a hierarchical directory structure based on the values of one or more columns. 2128 This can improve query performance when working with large datasets 2129 :type parquet_partitions: list | None 2130 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2131 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2132 multiple files. It helps in optimizing the export process by breaking down the data into 2133 manageable chunks for processing and storage 2134 :type chunk_size: int | None 2135 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2136 threads to be used during the export process. It determines the level of parallelism and can 2137 improve the performance of the export operation. If this parameter is not provided, the function 2138 will use the default number of threads 2139 :type threads: int | None 2140 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2141 determines whether the output file should be sorted based on genomic coordinates of the 2142 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2143 `False`,, defaults to False 2144 :type sort: bool (optional) 2145 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2146 determines whether an index should be created on the output file. If `index` is set to `True`, 2147 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2148 :type index: bool (optional) 2149 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2150 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2151 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2152 output file should be 2153 :type order_by: str | None 2154 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2155 mapping of field names to be renamed during the export process. This parameter allows you to 2156 customize the output field names before exporting the data. Each key-value pair in the 2157 dictionary represents the original field name as the key and the new field name 2158 :type fields_to_rename: dict | None 2159 :return: The `export_output` function returns a boolean value. It checks if the output file 2160 exists and returns True if it does, or None if it doesn't. 
2161 """ 2162 2163 # Log 2164 log.info("Exporting...") 2165 2166 # Full path 2167 output_file = full_path(output_file) 2168 output_header = full_path(output_header) 2169 2170 # Config 2171 config = self.get_config() 2172 2173 # Param 2174 param = self.get_param() 2175 2176 # Tmp files to remove 2177 tmp_to_remove = [] 2178 2179 # If no output, get it 2180 if not output_file: 2181 output_file = self.get_output() 2182 2183 # If not threads 2184 if not threads: 2185 threads = self.get_threads() 2186 2187 # Rename fields 2188 if not fields_to_rename: 2189 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2190 self.rename_info_fields(fields_to_rename=fields_to_rename) 2191 2192 # Auto header name with extension 2193 if export_header or output_header: 2194 if not output_header: 2195 output_header = f"{output_file}.hdr" 2196 # Export header 2197 self.export_header(output_file=output_file) 2198 2199 # Switch off export header if VCF output 2200 output_file_type = get_file_format(output_file) 2201 if output_file_type in ["vcf"]: 2202 export_header = False 2203 tmp_to_remove.append(output_header) 2204 2205 # Chunk size 2206 if not chunk_size: 2207 chunk_size = config.get("chunk_size", None) 2208 2209 # Parquet partition 2210 if not parquet_partitions: 2211 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2212 if parquet_partitions and isinstance(parquet_partitions, str): 2213 parquet_partitions = parquet_partitions.split(",") 2214 2215 # Order by 2216 if not order_by: 2217 order_by = param.get("export", {}).get("order_by", "") 2218 2219 # Header in output 2220 header_in_output = param.get("export", {}).get("include_header", False) 2221 2222 # Database 2223 database_source = self.get_connexion() 2224 2225 # Connexion format 2226 connexion_format = self.get_connexion_format() 2227 2228 # Explode infos 2229 if self.get_explode_infos(): 2230 self.explode_infos( 2231 prefix=self.get_explode_infos_prefix(), 2232 
fields=self.get_explode_infos_fields(), 2233 force=False, 2234 ) 2235 2236 # if connexion_format in ["sqlite"] or query: 2237 if connexion_format in ["sqlite"]: 2238 2239 # Export in Parquet 2240 random_tmp = "".join( 2241 random.choice(string.ascii_lowercase) for i in range(10) 2242 ) 2243 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2244 tmp_to_remove.append(database_source) 2245 2246 # Table Variants 2247 table_variants = self.get_table_variants() 2248 2249 # Create export query 2250 sql_query_export_subquery = f""" 2251 SELECT * FROM {table_variants} 2252 """ 2253 2254 # Write source file 2255 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2256 2257 # Create database 2258 database = Database( 2259 database=database_source, 2260 table="variants", 2261 header_file=output_header, 2262 conn_config=self.get_connexion_config(), 2263 ) 2264 2265 # Existing colomns header 2266 existing_columns_header = database.get_header_columns_from_database(query=query) 2267 2268 # Sample list 2269 if output_file_type in ["vcf"]: 2270 get_samples = self.get_samples() 2271 get_samples_check = self.get_samples_check() 2272 samples_force = get_samples is not None 2273 sample_list = self.get_header_sample_list( 2274 check=get_samples_check, 2275 samples=get_samples, 2276 samples_force=samples_force, 2277 ) 2278 else: 2279 sample_list = None 2280 2281 # Export file 2282 database.export( 2283 output_database=output_file, 2284 output_header=output_header, 2285 existing_columns_header=existing_columns_header, 2286 parquet_partitions=parquet_partitions, 2287 chunk_size=chunk_size, 2288 threads=threads, 2289 sort=sort, 2290 index=index, 2291 header_in_output=header_in_output, 2292 order_by=order_by, 2293 query=query, 2294 export_header=export_header, 2295 sample_list=sample_list, 2296 ) 2297 2298 # Remove 2299 remove_if_exists(tmp_to_remove) 2300 2301 return (os.path.exists(output_file) or None) and ( 2302 os.path.exists(output_file) 
or None 2303 )
The export_output function exports data from a VCF file to various formats, including VCF,
CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
partitioning.
Parameters
- output_file: The `output_file` parameter is a string that specifies the name of the output file where the exported data will be saved.
- output_header: The `output_header` parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the `output_file` parameter, but with the extension ".hdr".
- export_header: The `export_header` parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. Defaults to `True`.
- query: The `query` parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported.
- parquet_partitions: The `parquet_partitions` parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning organizes data in a hierarchical directory structure based on the values of one or more columns, which can improve query performance on large datasets.
- chunk_size: The `chunk_size` parameter specifies the number of records in a batch when exporting data in Parquet format. It is used for partitioning the Parquet file into multiple files.
- threads: The `threads` parameter specifies the number of threads to be used during the export process. If this parameter is not provided, the function will use the default number of threads.
- sort: The `sort` parameter is a boolean flag that determines whether the output file should be sorted based on genomic coordinates of the variants. Defaults to `False`.
- index: The `index` parameter is a boolean flag that determines whether an index should be created on the output file. Defaults to `False`.
- order_by: The `order_by` parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format.
- fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the mapping of field names to be renamed during the export process. Each key-value pair represents the original field name as the key and the new field name as the value.
Returns
The `export_output` function returns a boolean value: it checks whether the output file exists and returns `True` if it does, or `None` if it doesn't.
2305 def get_extra_infos(self, table: str = None) -> list: 2306 """ 2307 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2308 in the header. 2309 2310 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2311 name of the table from which you want to retrieve the extra columns that are not present in the 2312 header. If the `table` parameter is not provided when calling the function, it will default to 2313 using the variants 2314 :type table: str 2315 :return: A list of columns that are in the specified table but not in the header of the table. 2316 """ 2317 2318 header_columns = [] 2319 2320 if not table: 2321 table = self.get_table_variants(clause="from") 2322 header_columns = self.get_header_columns() 2323 2324 # Check all columns in the database 2325 query = f""" SELECT * FROM {table} LIMIT 1 """ 2326 log.debug(f"query {query}") 2327 table_columns = self.get_query_to_df(query).columns.tolist() 2328 extra_columns = [] 2329 2330 # Construct extra infos (not in header) 2331 for column in table_columns: 2332 if column not in header_columns: 2333 extra_columns.append(column) 2334 2335 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The `table` parameter is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the `table` parameter is not provided, it defaults to the variants table.
Returns
A list of columns that are in the specified table but not in the header of the table.
2337 def get_extra_infos_sql(self, table: str = None) -> str: 2338 """ 2339 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2340 by double quotes 2341 2342 :param table: The name of the table to get the extra infos from. If None, the default table is 2343 used 2344 :type table: str 2345 :return: A string of the extra infos 2346 """ 2347 2348 return ", ".join( 2349 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2350 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
2352 def export_header( 2353 self, 2354 header_name: str = None, 2355 output_file: str = None, 2356 output_file_ext: str = ".hdr", 2357 clean_header: bool = True, 2358 remove_chrom_line: bool = False, 2359 ) -> str: 2360 """ 2361 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2362 specified options, and writes it to a new file. 2363 2364 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2365 this parameter is not specified, the header will be written to the output file 2366 :type header_name: str 2367 :param output_file: The `output_file` parameter in the `export_header` function is used to 2368 specify the name of the output file where the header will be written. If this parameter is not 2369 provided, the header will be written to a temporary file 2370 :type output_file: str 2371 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2372 string that represents the extension of the output header file. By default, it is set to ".hdr" 2373 if not specified by the user. This extension will be appended to the `output_file` name to 2374 create the final, defaults to .hdr 2375 :type output_file_ext: str (optional) 2376 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2377 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2378 `True`, the function will clean the header by modifying certain lines based on a specific 2379 pattern. If `clean_header`, defaults to True 2380 :type clean_header: bool (optional) 2381 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2382 boolean flag that determines whether the #CHROM line should be removed from the header before 2383 writing it to the output file. 
If set to `True`, the #CHROM line will be removed; if set to `, 2384 defaults to False 2385 :type remove_chrom_line: bool (optional) 2386 :return: The function `export_header` returns the name of the temporary header file that is 2387 created. 2388 """ 2389 2390 if not header_name and not output_file: 2391 output_file = self.get_output() 2392 2393 if self.get_header(): 2394 2395 # Get header object 2396 header_obj = self.get_header() 2397 2398 # Create database 2399 db_for_header = Database(database=self.get_input()) 2400 2401 # Get real columns in the file 2402 db_header_columns = db_for_header.get_columns() 2403 2404 with tempfile.TemporaryDirectory() as tmpdir: 2405 2406 # Write header file 2407 header_file_tmp = os.path.join(tmpdir, "header") 2408 f = open(header_file_tmp, "w") 2409 vcf.Writer(f, header_obj) 2410 f.close() 2411 2412 # Replace #CHROM line with rel columns 2413 header_list = db_for_header.read_header_file( 2414 header_file=header_file_tmp 2415 ) 2416 header_list[-1] = "\t".join(db_header_columns) 2417 2418 # Remove CHROM line 2419 if remove_chrom_line: 2420 header_list.pop() 2421 2422 # Clean header 2423 if clean_header: 2424 header_list_clean = [] 2425 for head in header_list: 2426 # Clean head for malformed header 2427 head_clean = head 2428 head_clean = re.subn( 2429 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2430 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2431 head_clean, 2432 2, 2433 )[0] 2434 # Write header 2435 header_list_clean.append(head_clean) 2436 header_list = header_list_clean 2437 2438 tmp_header_name = output_file + output_file_ext 2439 2440 f = open(tmp_header_name, "w") 2441 for line in header_list: 2442 f.write(line) 2443 f.close() 2444 2445 return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The `header_name` parameter is the name of the header file to be created. If this parameter is not specified, the header will be written next to the output file.
- output_file: The `output_file` parameter is used to specify the name of the output file where the header will be written.
- output_file_ext: The `output_file_ext` parameter is a string that represents the extension of the output header file, appended to the `output_file` name. Defaults to ".hdr".
- clean_header: The `clean_header` parameter is a boolean flag that determines whether the header should be cleaned by modifying certain malformed lines. Defaults to `True`.
- remove_chrom_line: The `remove_chrom_line` parameter is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. Defaults to `False`.
Returns
The function `export_header` returns the name of the header file that is created.
2447 def export_variant_vcf( 2448 self, 2449 vcf_file, 2450 remove_info: bool = False, 2451 add_samples: bool = True, 2452 list_samples: list = [], 2453 where_clause: str = "", 2454 index: bool = False, 2455 threads: int | None = None, 2456 ) -> bool | None: 2457 """ 2458 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2459 remove INFO field, add samples, and control compression and indexing. 2460 2461 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2462 written to. It is the output file that will contain the filtered VCF data based on the specified 2463 parameters 2464 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2465 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2466 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2467 in, defaults to False 2468 :type remove_info: bool (optional) 2469 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2470 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2471 If set to False, the samples will be removed. The default value is True, defaults to True 2472 :type add_samples: bool (optional) 2473 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2474 in the output VCF file. By default, all samples will be included. If you provide a list of 2475 samples, only those samples will be included in the output file 2476 :type list_samples: list 2477 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2478 determines whether or not to create an index for the output VCF file. If `index` is set to 2479 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2480 :type index: bool (optional) 2481 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2482 number of threads to use for exporting the VCF file. It determines how many parallel threads 2483 will be used during the export process. More threads can potentially speed up the export process 2484 by utilizing multiple cores of the processor. If 2485 :type threads: int | None 2486 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2487 method with various parameters including the output file, query, threads, sort flag, and index 2488 flag. The `export_output` method is responsible for exporting the VCF data based on the 2489 specified parameters and configurations provided in the `export_variant_vcf` function. 2490 """ 2491 2492 # Config 2493 config = self.get_config() 2494 2495 # Extract VCF 2496 log.debug("Export VCF...") 2497 2498 # Table variants 2499 table_variants = self.get_table_variants() 2500 2501 # Threads 2502 if not threads: 2503 threads = self.get_threads() 2504 2505 # Info fields 2506 if remove_info: 2507 if not isinstance(remove_info, str): 2508 remove_info = "." 
2509 info_field = f"""'{remove_info}' as INFO""" 2510 else: 2511 info_field = "INFO" 2512 2513 # Samples fields 2514 if add_samples: 2515 if not list_samples: 2516 list_samples = self.get_header_sample_list() 2517 if list_samples: 2518 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2519 else: 2520 samples_fields = "" 2521 log.debug(f"samples_fields: {samples_fields}") 2522 else: 2523 samples_fields = "" 2524 2525 # Where clause 2526 if where_clause is None: 2527 where_clause = "" 2528 2529 # Variants 2530 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2531 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2532 log.debug(f"sql_query_select={sql_query_select}") 2533 2534 return self.export_output( 2535 output_file=vcf_file, 2536 output_header=None, 2537 export_header=True, 2538 query=sql_query_select, 2539 parquet_partitions=None, 2540 chunk_size=config.get("chunk_size", None), 2541 threads=threads, 2542 sort=True, 2543 index=index, 2544 order_by=None, 2545 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
`vcf_file` parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters. - remove_info: The
`remove_info` parameter in the `export_variant_vcf` function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included; defaults to False. - add_samples: The
`add_samples` parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True. - list_samples: The
`list_samples` parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file. - index: The
`index` parameter in the `export_variant_vcf` function is a boolean flag that determines whether or not to create an index for the output VCF file. If `index` is set to `True`, the output VCF file will be indexed using tabix; defaults to False. - threads: The
`threads` parameter in the `export_variant_vcf` function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not provided, the configured number of threads is used.
Returns
The
`export_variant_vcf` function returns the result of calling the `export_output` method with various parameters including the output file, query, threads, sort flag, and index flag. The `export_output` method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the `export_variant_vcf` function.
2547 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2548 """ 2549 It takes a list of commands and runs them in parallel using the number of threads specified 2550 2551 :param commands: A list of commands to run 2552 :param threads: The number of threads to use, defaults to 1 (optional) 2553 """ 2554 2555 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2557 def get_threads(self, default: int = 1) -> int: 2558 """ 2559 This function returns the number of threads to use for a job, with a default value of 1 if not 2560 specified. 2561 2562 :param default: The `default` parameter in the `get_threads` method is used to specify the 2563 default number of threads to use if no specific value is provided. If no value is provided for 2564 the `threads` parameter in the configuration or input parameters, the `default` value will be 2565 used, defaults to 1 2566 :type default: int (optional) 2567 :return: the number of threads to use for the current job. 2568 """ 2569 2570 # Config 2571 config = self.get_config() 2572 2573 # Param 2574 param = self.get_param() 2575 2576 # Input threads 2577 input_thread = param.get("threads", config.get("threads", None)) 2578 2579 # Check threads 2580 if not input_thread: 2581 threads = default 2582 elif int(input_thread) <= 0: 2583 threads = os.cpu_count() 2584 else: 2585 threads = int(input_thread) 2586 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
`default` parameter in the `get_threads` method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value will be used; defaults to 1
Returns
the number of threads to use for the current job.
2588 def get_memory(self, default: str = None) -> str: 2589 """ 2590 This function retrieves the memory value from parameters or configuration with a default value 2591 if not found. 2592 2593 :param default: The `get_memory` function takes in a default value as a string parameter. This 2594 default value is used as a fallback in case the `memory` parameter is not provided in the 2595 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2596 the function 2597 :type default: str 2598 :return: The `get_memory` function returns a string value representing the memory parameter. If 2599 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2600 return the default value provided as an argument to the function. 2601 """ 2602 2603 # Config 2604 config = self.get_config() 2605 2606 # Param 2607 param = self.get_param() 2608 2609 # Input threads 2610 input_memory = param.get("memory", config.get("memory", None)) 2611 2612 # Check threads 2613 if input_memory: 2614 memory = input_memory 2615 else: 2616 memory = default 2617 2618 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
`get_memory` function takes in a default value as a string parameter. This default value is used as a fallback in case the `memory` parameter is not provided in the `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, the function returns the default.
Returns
The
`get_memory` function returns a string value representing the memory parameter. If the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2620 def update_from_vcf(self, vcf_file: str) -> None: 2621 """ 2622 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2623 2624 :param vcf_file: the path to the VCF file 2625 """ 2626 2627 connexion_format = self.get_connexion_format() 2628 2629 if connexion_format in ["duckdb"]: 2630 self.update_from_vcf_duckdb(vcf_file) 2631 elif connexion_format in ["sqlite"]: 2632 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
2634 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2635 """ 2636 It takes a VCF file and updates the INFO column of the variants table in the database with the 2637 INFO column of the VCF file 2638 2639 :param vcf_file: the path to the VCF file 2640 """ 2641 2642 # varaints table 2643 table_variants = self.get_table_variants() 2644 2645 # Loading VCF into temporaire table 2646 skip = self.get_header_length(file=vcf_file) 2647 vcf_df = pd.read_csv( 2648 vcf_file, 2649 sep="\t", 2650 engine="c", 2651 skiprows=skip, 2652 header=0, 2653 low_memory=False, 2654 ) 2655 sql_query_update = f""" 2656 UPDATE {table_variants} as table_variants 2657 SET INFO = concat( 2658 CASE 2659 WHEN INFO NOT IN ('', '.') 2660 THEN INFO 2661 ELSE '' 2662 END, 2663 ( 2664 SELECT 2665 concat( 2666 CASE 2667 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2668 THEN ';' 2669 ELSE '' 2670 END 2671 , 2672 CASE 2673 WHEN table_parquet.INFO NOT IN ('','.') 2674 THEN table_parquet.INFO 2675 ELSE '' 2676 END 2677 ) 2678 FROM vcf_df as table_parquet 2679 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2680 AND table_parquet.\"POS\" = table_variants.\"POS\" 2681 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2682 AND table_parquet.\"REF\" = table_variants.\"REF\" 2683 AND table_parquet.INFO NOT IN ('','.') 2684 ) 2685 ) 2686 ; 2687 """ 2688 self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
2690 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2691 """ 2692 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2693 table, then updates the INFO column of the variants table with the INFO column of the temporary 2694 table 2695 2696 :param vcf_file: The path to the VCF file you want to update the database with 2697 """ 2698 2699 # Create a temporary table for the VCF 2700 table_vcf = "tmp_vcf" 2701 sql_create = ( 2702 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2703 ) 2704 self.conn.execute(sql_create) 2705 2706 # Loading VCF into temporaire table 2707 vcf_df = pd.read_csv( 2708 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2709 ) 2710 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2711 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2712 2713 # Update table 'variants' with VCF data 2714 # warning: CONCAT as || operator 2715 sql_query_update = f""" 2716 UPDATE variants as table_variants 2717 SET INFO = CASE 2718 WHEN INFO NOT IN ('', '.') 2719 THEN INFO 2720 ELSE '' 2721 END || 2722 ( 2723 SELECT 2724 CASE 2725 WHEN table_variants.INFO NOT IN ('','.') 2726 AND table_vcf.INFO NOT IN ('','.') 2727 THEN ';' 2728 ELSE '' 2729 END || 2730 CASE 2731 WHEN table_vcf.INFO NOT IN ('','.') 2732 THEN table_vcf.INFO 2733 ELSE '' 2734 END 2735 FROM {table_vcf} as table_vcf 2736 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2737 AND table_vcf.\"POS\" = table_variants.\"POS\" 2738 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2739 AND table_vcf.\"REF\" = table_variants.\"REF\" 2740 ) 2741 """ 2742 self.conn.execute(sql_query_update) 2743 2744 # Drop temporary table 2745 sql_drop = f"DROP TABLE {table_vcf}" 2746 self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2748 def drop_variants_table(self) -> None: 2749 """ 2750 > This function drops the variants table 2751 """ 2752 2753 table_variants = self.get_table_variants() 2754 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2755 self.conn.execute(sql_table_variants)
This function drops the variants table
2757 def set_variant_id( 2758 self, variant_id_column: str = "variant_id", force: bool = None 2759 ) -> str: 2760 """ 2761 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2762 `#CHROM`, `POS`, `REF`, and `ALT` columns 2763 2764 :param variant_id_column: The name of the column to be created in the variants table, defaults 2765 to variant_id 2766 :type variant_id_column: str (optional) 2767 :param force: If True, the variant_id column will be created even if it already exists 2768 :type force: bool 2769 :return: The name of the column that contains the variant_id 2770 """ 2771 2772 # Assembly 2773 assembly = self.get_param().get( 2774 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2775 ) 2776 2777 # INFO/Tag prefix 2778 prefix = self.get_explode_infos_prefix() 2779 2780 # Explode INFO/SVTYPE 2781 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2782 2783 # variants table 2784 table_variants = self.get_table_variants() 2785 2786 # variant_id column 2787 if not variant_id_column: 2788 variant_id_column = "variant_id" 2789 2790 # Creta variant_id column 2791 if "variant_id" not in self.get_extra_infos() or force: 2792 2793 # Create column 2794 self.add_column( 2795 table_name=table_variants, 2796 column_name=variant_id_column, 2797 column_type="UBIGINT", 2798 default_value="0", 2799 ) 2800 2801 # Update column 2802 self.conn.execute( 2803 f""" 2804 UPDATE {table_variants} 2805 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2806 """ 2807 ) 2808 2809 # Remove added columns 2810 for added_column in added_columns: 2811 self.drop_column(column=added_column) 2812 2813 # return variant_id column name 2814 return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2816 def get_variant_id_column( 2817 self, variant_id_column: str = "variant_id", force: bool = None 2818 ) -> str: 2819 """ 2820 This function returns the variant_id column name 2821 2822 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2823 defaults to variant_id 2824 :type variant_id_column: str (optional) 2825 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2826 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2827 if it is not already set, or if it is set 2828 :type force: bool 2829 :return: The variant_id column name. 2830 """ 2831 2832 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2838 def scan_databases( 2839 self, 2840 database_formats: list = ["parquet"], 2841 database_releases: list = ["current"], 2842 ) -> dict: 2843 """ 2844 The function `scan_databases` scans for available databases based on specified formats and 2845 releases. 2846 2847 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2848 of the databases to be scanned. In this case, the accepted format is "parquet" 2849 :type database_formats: list ["parquet"] 2850 :param database_releases: The `database_releases` parameter is a list that specifies the 2851 releases of the databases to be scanned. In the provided function, the default value for 2852 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2853 databases that are in the "current" 2854 :type database_releases: list 2855 :return: The function `scan_databases` returns a dictionary containing information about 2856 databases that match the specified formats and releases. 2857 """ 2858 2859 # Config 2860 config = self.get_config() 2861 2862 # Param 2863 param = self.get_param() 2864 2865 # Param - Assembly 2866 assembly = param.get("assembly", config.get("assembly", None)) 2867 if not assembly: 2868 assembly = DEFAULT_ASSEMBLY 2869 log.warning(f"Default assembly '{assembly}'") 2870 2871 # Scan for availabled databases 2872 log.info( 2873 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2874 ) 2875 databases_infos_dict = databases_infos( 2876 database_folder_releases=database_releases, 2877 database_formats=database_formats, 2878 assembly=assembly, 2879 config=config, 2880 ) 2881 log.info( 2882 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2883 ) 2884 2885 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The
`database_formats` parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet". - database_releases: The
`database_releases` parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for `database_releases` is set to `["current"]`, meaning that by default, the function will scan databases that are in the "current" release.
Returns
The function
`scan_databases` returns a dictionary containing information about databases that match the specified formats and releases.
    def annotation(self) -> None:
        """
        Annotate the VCF with the annotations specified in the parameters.

        The method first merges the quick "annotations" string with the
        per-tool parameters (annotation_parquet, annotation_snpsift,
        annotation_snpeff, annotation_bcftools, annotation_annovar,
        annotation_exomiser, annotation_splice) into a single comma-separated
        list, resolves each annotation database file (searching the configured
        database folders per assembly), fills `param["annotation"]` per tool,
        stores the parameters back with `set_param`, runs each configured
        annotation tool, and finally explodes INFO fields into table columns
        when requested.
        """

        # Config
        config = self.get_config()

        # Param (mutated in place throughout this method, then stored back)
        param = self.get_param()

        # Param - Assembly (param takes precedence over config, then default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Candidate folders where annotation databases are searched
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Quick "annotations" parameter: comma-separated string of databases
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Per-tool parameters are folded into the same list with a "tool:" prefix
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # Multiple databases are joined with '+' (',' is the list separator)
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into the "annotations" parameter
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # Normalize the annotations parameter to a dict {database: fields}
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # "ALL" (optionally with format=/release= options) expands to
                # every database found by scan_databases
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database and dispatch it to the right annotation tool
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff ("snpeff:<options>")
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar ("annovar:<db>[:<db>...]")
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser (options parsed from the string)
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice (options parsed from the string)
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (file-based databases)
                    else:

                        # Explicit tool prefix, if any ("bcftools:", "snpsift:", "bigwig:")
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # Several files may be given, separated by '+' or ':'
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        # NOTE: the loop variable deliberately rebinds `annotation_file`
                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Resolve the database file: as-is, expanded,
                                # then searched in the database folders
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly subfolder)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    # Inspect the database to pick a tool
                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): hard-coded False disables the
                                    # bcftools branch below — confirm intended.
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" appears twice in
                                        # this list — harmless but likely a typo.
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in ["bw"]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        # Store the (mutated) parameters back before running the tools
        self.set_param(param)

        # Run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It annotates the VCF file with the annotations specified in the config file.
3265 def annotation_bigwig(self, threads: int = None) -> None: 3266 """ 3267 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3268 3269 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3270 number of threads to be used for parallel processing during the annotation process. If the 3271 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3272 threads to use based on the system configuration 3273 :type threads: int 3274 :return: True 3275 """ 3276 3277 # DEBUG 3278 log.debug("Start annotation with bigwig databases") 3279 3280 # # Threads 3281 # if not threads: 3282 # threads = self.get_threads() 3283 # log.debug("Threads: " + str(threads)) 3284 3285 # Config 3286 config = self.get_config() 3287 log.debug("Config: " + str(config)) 3288 3289 # Config - BCFTools databases folders 3290 databases_folders = set( 3291 self.get_config() 3292 .get("folders", {}) 3293 .get("databases", {}) 3294 .get("annotations", ["."]) 3295 + self.get_config() 3296 .get("folders", {}) 3297 .get("databases", {}) 3298 .get("bigwig", ["."]) 3299 ) 3300 log.debug("Databases annotations: " + str(databases_folders)) 3301 3302 # Param 3303 annotations = ( 3304 self.get_param() 3305 .get("annotation", {}) 3306 .get("bigwig", {}) 3307 .get("annotations", None) 3308 ) 3309 log.debug("Annotations: " + str(annotations)) 3310 3311 # Assembly 3312 assembly = self.get_param().get( 3313 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3314 ) 3315 3316 # Data 3317 table_variants = self.get_table_variants() 3318 3319 # Check if not empty 3320 log.debug("Check if not empty") 3321 sql_query_chromosomes = ( 3322 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3323 ) 3324 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3325 if not sql_query_chromosomes_df["count"][0]: 3326 log.info(f"VCF empty") 3327 return 3328 3329 # VCF 
header 3330 vcf_reader = self.get_header() 3331 log.debug("Initial header: " + str(vcf_reader.infos)) 3332 3333 # Existing annotations 3334 for vcf_annotation in self.get_header().infos: 3335 3336 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3337 log.debug( 3338 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3339 ) 3340 3341 if annotations: 3342 3343 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3344 3345 # Export VCF file 3346 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3347 3348 # annotation_bigwig_config 3349 annotation_bigwig_config_list = [] 3350 3351 for annotation in annotations: 3352 annotation_fields = annotations[annotation] 3353 3354 # Annotation Name 3355 annotation_name = os.path.basename(annotation) 3356 3357 if not annotation_fields: 3358 annotation_fields = {"INFO": None} 3359 3360 log.debug(f"Annotation '{annotation_name}'") 3361 log.debug( 3362 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3363 ) 3364 3365 # Create Database 3366 database = Database( 3367 database=annotation, 3368 databases_folders=databases_folders, 3369 assembly=assembly, 3370 ) 3371 3372 # Find files 3373 db_file = database.get_database() 3374 db_file = full_path(db_file) 3375 db_hdr_file = database.get_header_file() 3376 db_hdr_file = full_path(db_hdr_file) 3377 db_file_type = database.get_format() 3378 3379 # If db_file is http ? 
3380 if database.get_database().startswith("http"): 3381 3382 # Datbase is HTTP URL 3383 db_file_is_http = True 3384 3385 # DB file keep as URL 3386 db_file = database.get_database() 3387 log.warning( 3388 f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)" 3389 ) 3390 3391 # Retrieve automatic annotation field name 3392 annotation_field = clean_annotation_field( 3393 os.path.basename(db_file).replace(".bw", "") 3394 ) 3395 log.debug( 3396 f"Create header file with annotation field '{annotation_field}' is an HTTP URL" 3397 ) 3398 3399 # Create automatic header file 3400 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3401 with open(db_hdr_file, "w") as f: 3402 f.write("##fileformat=VCFv4.2\n") 3403 f.write( 3404 f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""" 3405 ) 3406 f.write(f"#CHROM START END {annotation_field}\n") 3407 3408 else: 3409 3410 # Datbase is NOT HTTP URL 3411 db_file_is_http = False 3412 3413 # Check index - try to create if not exists 3414 if ( 3415 db_file is None 3416 or db_hdr_file is None 3417 or (not os.path.exists(db_file) and not db_file_is_http) 3418 or not os.path.exists(db_hdr_file) 3419 or not db_file_type in ["bw"] 3420 ): 3421 # if False: 3422 log.error("Annotation failed: database not valid") 3423 log.error(f"Annotation annotation file: {db_file}") 3424 log.error(f"Annotation annotation file type: {db_file_type}") 3425 log.error(f"Annotation annotation header: {db_hdr_file}") 3426 raise ValueError( 3427 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3428 ) 3429 else: 3430 3431 # Log 3432 log.debug( 3433 f"Annotation '{annotation}' - file: " 3434 + str(db_file) 3435 + " and " 3436 + str(db_hdr_file) 3437 ) 3438 3439 # Load header as VCF object 3440 db_hdr_vcf = Variants(input=db_hdr_file) 3441 db_hdr_vcf_header_infos = 
db_hdr_vcf.get_header().infos 3442 log.debug( 3443 "Annotation database header: " 3444 + str(db_hdr_vcf_header_infos) 3445 ) 3446 3447 # For all fields in database 3448 annotation_fields_full = False 3449 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3450 annotation_fields = { 3451 key: key for key in db_hdr_vcf_header_infos 3452 } 3453 log.debug( 3454 "Annotation database header - All annotations added: " 3455 + str(annotation_fields) 3456 ) 3457 annotation_fields_full = True 3458 3459 # Init 3460 cyvcf2_header_rename_dict = {} 3461 cyvcf2_header_list = [] 3462 cyvcf2_header_indexes = {} 3463 3464 # process annotation fields 3465 for annotation_field in annotation_fields: 3466 3467 # New annotation name 3468 annotation_field_new = annotation_fields[annotation_field] 3469 3470 # Check annotation field and index in header 3471 if ( 3472 annotation_field 3473 in db_hdr_vcf.get_header_columns_as_list() 3474 ): 3475 annotation_field_index = ( 3476 db_hdr_vcf.get_header_columns_as_list().index( 3477 annotation_field 3478 ) 3479 - 3 3480 ) 3481 cyvcf2_header_indexes[annotation_field_new] = ( 3482 annotation_field_index 3483 ) 3484 else: 3485 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3486 log.error(msg_err) 3487 raise ValueError(msg_err) 3488 3489 # Append annotation field in cyvcf2 header list 3490 cyvcf2_header_rename_dict[annotation_field_new] = ( 3491 db_hdr_vcf_header_infos[annotation_field].id 3492 ) 3493 cyvcf2_header_list.append( 3494 { 3495 "ID": annotation_field_new, 3496 "Number": db_hdr_vcf_header_infos[ 3497 annotation_field 3498 ].num, 3499 "Type": db_hdr_vcf_header_infos[ 3500 annotation_field 3501 ].type, 3502 "Description": db_hdr_vcf_header_infos[ 3503 annotation_field 3504 ].desc, 3505 } 3506 ) 3507 3508 # Add header on VCF 3509 vcf_reader.infos[annotation_field_new] = vcf.parser._Info( 3510 annotation_field_new, 3511 db_hdr_vcf_header_infos[annotation_field].num, 3512 
db_hdr_vcf_header_infos[annotation_field].type, 3513 db_hdr_vcf_header_infos[annotation_field].desc, 3514 "HOWARD BigWig annotation", 3515 "unknown", 3516 self.code_type_map[ 3517 db_hdr_vcf_header_infos[annotation_field].type 3518 ], 3519 ) 3520 3521 # Load bigwig database 3522 bw_db = pyBigWig.open(db_file) 3523 if bw_db.isBigWig(): 3524 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3525 else: 3526 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3527 log.error(msg_err) 3528 raise ValueError(msg_err) 3529 3530 annotation_bigwig_config_list.append( 3531 { 3532 "db_file": db_file, 3533 "bw_db": bw_db, 3534 "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3535 "cyvcf2_header_list": cyvcf2_header_list, 3536 "cyvcf2_header_indexes": cyvcf2_header_indexes, 3537 } 3538 ) 3539 3540 # Annotate 3541 if annotation_bigwig_config_list: 3542 3543 # Annotation config 3544 log.debug( 3545 f"annotation_bigwig_config={annotation_bigwig_config_list}" 3546 ) 3547 3548 # Export VCF file 3549 self.export_variant_vcf( 3550 vcf_file=tmp_vcf_name, 3551 remove_info=True, 3552 add_samples=False, 3553 index=True, 3554 ) 3555 3556 # Load input tmp file 3557 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3558 3559 # Add header in input file 3560 for annotation_bigwig_config in annotation_bigwig_config_list: 3561 for cyvcf2_header_field in annotation_bigwig_config.get( 3562 "cyvcf2_header_list", [] 3563 ): 3564 log.info( 3565 f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'" 3566 ) 3567 input_vcf.add_info_to_header(cyvcf2_header_field) 3568 3569 # Create output VCF file 3570 output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz") 3571 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3572 3573 # Fetch variants 3574 log.info(f"Annotations 'bigwig' start...") 3575 
for variant in input_vcf: 3576 3577 for annotation_bigwig_config in annotation_bigwig_config_list: 3578 3579 # DB and indexes 3580 bw_db = annotation_bigwig_config.get("bw_db", None) 3581 cyvcf2_header_indexes = annotation_bigwig_config.get( 3582 "cyvcf2_header_indexes", None 3583 ) 3584 3585 # Retrieve value from chrom pos 3586 res = bw_db.values( 3587 variant.CHROM, variant.POS - 1, variant.POS 3588 ) 3589 3590 # For each annotation fields (and indexes) 3591 for cyvcf2_header_index in cyvcf2_header_indexes: 3592 3593 # If value is NOT nNone 3594 if not np.isnan( 3595 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3596 ): 3597 variant.INFO[cyvcf2_header_index] = res[ 3598 cyvcf2_header_indexes[cyvcf2_header_index] 3599 ] 3600 3601 # Add record in output file 3602 output_vcf.write_record(variant) 3603 3604 # Log 3605 log.debug(f"Annotation done.") 3606 3607 # Close and write file 3608 log.info(f"Annotations 'bigwig' write...") 3609 output_vcf.close() 3610 log.debug(f"Write done.") 3611 3612 # Update variants 3613 log.info(f"Annotations 'bigwig' update...") 3614 self.update_from_vcf(output_vcf_file) 3615 log.debug(f"Update done.") 3616 3617 return True
The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
Parameters
- threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the number of threads to be used for parallel processing during the annotation process. If the `threads` parameter is not provided, the method will attempt to determine the optimal number of threads to use based on the system configuration.
Returns
True
3619 def annotation_snpsift(self, threads: int = None) -> None: 3620 """ 3621 This function annotate with bcftools 3622 3623 :param threads: Number of threads to use 3624 :return: the value of the variable "return_value". 3625 """ 3626 3627 # DEBUG 3628 log.debug("Start annotation with bcftools databases") 3629 3630 # Threads 3631 if not threads: 3632 threads = self.get_threads() 3633 log.debug("Threads: " + str(threads)) 3634 3635 # Config 3636 config = self.get_config() 3637 log.debug("Config: " + str(config)) 3638 3639 # Config - snpSift 3640 snpsift_bin_command = get_bin_command( 3641 bin="SnpSift.jar", 3642 tool="snpsift", 3643 bin_type="jar", 3644 config=config, 3645 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3646 ) 3647 if not snpsift_bin_command: 3648 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3649 log.error(msg_err) 3650 raise ValueError(msg_err) 3651 3652 # Config - bcftools 3653 bcftools_bin_command = get_bin_command( 3654 bin="bcftools", 3655 tool="bcftools", 3656 bin_type="bin", 3657 config=config, 3658 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3659 ) 3660 if not bcftools_bin_command: 3661 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3662 log.error(msg_err) 3663 raise ValueError(msg_err) 3664 3665 # Config - BCFTools databases folders 3666 databases_folders = set( 3667 self.get_config() 3668 .get("folders", {}) 3669 .get("databases", {}) 3670 .get("annotations", ["."]) 3671 + self.get_config() 3672 .get("folders", {}) 3673 .get("databases", {}) 3674 .get("bcftools", ["."]) 3675 ) 3676 log.debug("Databases annotations: " + str(databases_folders)) 3677 3678 # Param 3679 annotations = ( 3680 self.get_param() 3681 .get("annotation", {}) 3682 .get("snpsift", {}) 3683 .get("annotations", None) 3684 ) 3685 log.debug("Annotations: " + str(annotations)) 3686 3687 # Assembly 3688 assembly = self.get_param().get( 3689 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3690 ) 3691 
3692 # Data 3693 table_variants = self.get_table_variants() 3694 3695 # Check if not empty 3696 log.debug("Check if not empty") 3697 sql_query_chromosomes = ( 3698 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3699 ) 3700 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3701 if not sql_query_chromosomes_df["count"][0]: 3702 log.info(f"VCF empty") 3703 return 3704 3705 # VCF header 3706 vcf_reader = self.get_header() 3707 log.debug("Initial header: " + str(vcf_reader.infos)) 3708 3709 # Existing annotations 3710 for vcf_annotation in self.get_header().infos: 3711 3712 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3713 log.debug( 3714 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3715 ) 3716 3717 if annotations: 3718 3719 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3720 3721 # Export VCF file 3722 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3723 3724 # Init 3725 commands = {} 3726 3727 for annotation in annotations: 3728 annotation_fields = annotations[annotation] 3729 3730 # Annotation Name 3731 annotation_name = os.path.basename(annotation) 3732 3733 if not annotation_fields: 3734 annotation_fields = {"INFO": None} 3735 3736 log.debug(f"Annotation '{annotation_name}'") 3737 log.debug( 3738 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3739 ) 3740 3741 # Create Database 3742 database = Database( 3743 database=annotation, 3744 databases_folders=databases_folders, 3745 assembly=assembly, 3746 ) 3747 3748 # Find files 3749 db_file = database.get_database() 3750 db_file = full_path(db_file) 3751 db_hdr_file = database.get_header_file() 3752 db_hdr_file = full_path(db_hdr_file) 3753 db_file_type = database.get_format() 3754 db_tbi_file = f"{db_file}.tbi" 3755 db_file_compressed = database.is_compressed() 3756 3757 # Check if compressed 3758 if not db_file_compressed: 3759 log.error( 3760 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3761 ) 3762 raise ValueError( 3763 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3764 ) 3765 3766 # Check if indexed 3767 if not os.path.exists(db_tbi_file): 3768 log.error( 3769 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3770 ) 3771 raise ValueError( 3772 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3773 ) 3774 3775 # Check index - try to create if not exists 3776 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3777 log.error("Annotation failed: database not valid") 3778 log.error(f"Annotation annotation file: {db_file}") 3779 log.error(f"Annotation annotation header: {db_hdr_file}") 3780 log.error(f"Annotation annotation index: {db_tbi_file}") 3781 raise ValueError( 3782 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3783 ) 3784 else: 3785 3786 log.debug( 3787 f"Annotation '{annotation}' - file: " 3788 + str(db_file) 3789 + " and " 3790 + str(db_hdr_file) 3791 ) 3792 3793 # Load header as VCF object 3794 db_hdr_vcf = Variants(input=db_hdr_file) 3795 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3796 log.debug( 3797 "Annotation database header: " 3798 + str(db_hdr_vcf_header_infos) 3799 ) 3800 3801 # For all fields in database 3802 annotation_fields_full = False 3803 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3804 annotation_fields = { 3805 key: key for key in db_hdr_vcf_header_infos 3806 } 3807 log.debug( 3808 "Annotation database header - All annotations added: " 3809 + str(annotation_fields) 3810 ) 3811 annotation_fields_full = True 3812 3813 # # Create file for field rename 3814 # log.debug("Create file for field rename") 3815 # tmp_rename = NamedTemporaryFile( 3816 # prefix=self.get_prefix(), 3817 # dir=self.get_tmp_dir(), 3818 # suffix=".rename", 3819 # delete=False, 3820 # ) 3821 # tmp_rename_name = tmp_rename.name 
3822 # tmp_files.append(tmp_rename_name) 3823 3824 # Number of fields 3825 nb_annotation_field = 0 3826 annotation_list = [] 3827 annotation_infos_rename_list = [] 3828 3829 for annotation_field in annotation_fields: 3830 3831 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3832 annotation_fields_new_name = annotation_fields.get( 3833 annotation_field, annotation_field 3834 ) 3835 if not annotation_fields_new_name: 3836 annotation_fields_new_name = annotation_field 3837 3838 # Check if field is in DB and if field is not elready in input data 3839 if ( 3840 annotation_field in db_hdr_vcf.get_header().infos 3841 and annotation_fields_new_name 3842 not in self.get_header().infos 3843 ): 3844 3845 log.info( 3846 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3847 ) 3848 3849 # BCFTools annotate param to rename fields 3850 if annotation_field != annotation_fields_new_name: 3851 annotation_infos_rename_list.append( 3852 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3853 ) 3854 3855 # Add INFO field to header 3856 db_hdr_vcf_header_infos_number = ( 3857 db_hdr_vcf_header_infos[annotation_field].num or "." 
3858 ) 3859 db_hdr_vcf_header_infos_type = ( 3860 db_hdr_vcf_header_infos[annotation_field].type 3861 or "String" 3862 ) 3863 db_hdr_vcf_header_infos_description = ( 3864 db_hdr_vcf_header_infos[annotation_field].desc 3865 or f"{annotation_field} description" 3866 ) 3867 db_hdr_vcf_header_infos_source = ( 3868 db_hdr_vcf_header_infos[annotation_field].source 3869 or "unknown" 3870 ) 3871 db_hdr_vcf_header_infos_version = ( 3872 db_hdr_vcf_header_infos[annotation_field].version 3873 or "unknown" 3874 ) 3875 3876 vcf_reader.infos[annotation_fields_new_name] = ( 3877 vcf.parser._Info( 3878 annotation_fields_new_name, 3879 db_hdr_vcf_header_infos_number, 3880 db_hdr_vcf_header_infos_type, 3881 db_hdr_vcf_header_infos_description, 3882 db_hdr_vcf_header_infos_source, 3883 db_hdr_vcf_header_infos_version, 3884 self.code_type_map[ 3885 db_hdr_vcf_header_infos_type 3886 ], 3887 ) 3888 ) 3889 3890 annotation_list.append(annotation_field) 3891 3892 nb_annotation_field += 1 3893 3894 else: 3895 3896 if ( 3897 annotation_field 3898 not in db_hdr_vcf.get_header().infos 3899 ): 3900 log.warning( 3901 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3902 ) 3903 if ( 3904 annotation_fields_new_name 3905 in self.get_header().infos 3906 ): 3907 log.warning( 3908 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3909 ) 3910 3911 log.info( 3912 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3913 ) 3914 3915 annotation_infos = ",".join(annotation_list) 3916 3917 if annotation_infos != "": 3918 3919 # Annotated VCF (and error file) 3920 tmp_annotation_vcf_name = os.path.join( 3921 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3922 ) 3923 tmp_annotation_vcf_name_err = ( 3924 tmp_annotation_vcf_name + ".err" 3925 ) 3926 3927 # Add fields to annotate 3928 if not annotation_fields_full: 3929 annotation_infos_option = f"-info {annotation_infos}" 3930 else: 
3931 annotation_infos_option = "" 3932 3933 # Info fields rename 3934 if annotation_infos_rename_list: 3935 annotation_infos_rename = " -c " + ",".join( 3936 annotation_infos_rename_list 3937 ) 3938 else: 3939 annotation_infos_rename = "" 3940 3941 # Annotate command 3942 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3943 3944 # Add command 3945 commands[command_annotate] = tmp_annotation_vcf_name 3946 3947 if commands: 3948 3949 # Export VCF file 3950 self.export_variant_vcf( 3951 vcf_file=tmp_vcf_name, 3952 remove_info=True, 3953 add_samples=False, 3954 index=True, 3955 ) 3956 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3957 3958 # Num command 3959 nb_command = 0 3960 3961 # Annotate 3962 for command_annotate in commands: 3963 nb_command += 1 3964 log.info( 3965 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3966 ) 3967 log.debug(f"command_annotate={command_annotate}") 3968 run_parallel_commands([command_annotate], threads) 3969 3970 # Debug 3971 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3972 3973 # Update variants 3974 log.info( 3975 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3976 ) 3977 self.update_from_vcf(commands[command_annotate])
This function annotates with SnpSift (piped through bcftools for field renaming).
Parameters
- threads: Number of threads to use
Returns
None; the variants table is updated in place from the annotated VCF files.
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate variants with `bcftools annotate`, building one command per
        database and per chromosome (restricted to merged variant regions via
        a BED file), running them in parallel, merging the annotated files
        with `bcftools merge`, and updating the variants table.

        :param threads: Number of threads to use; defaults to the object's
            configured number of threads
        :return: None if the VCF is empty; otherwise the variants table is
            updated in place via `update_from_vcf`
        :raises ValueError: if the bcftools binary is missing, a database is
            missing/not compressed/not indexed, or an annotate/merge command
            wrote '[E::' errors to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but not read later in
        # this method; temp files are actually removed by the shell 'rm -f'
        # appended to the merge command below — confirm whether debug mode is
        # supposed to keep them
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF - temp file name reserved now, written later by export_variant_vcf
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools requires bgzip-compressed databases)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix index required for region queries)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name (fall back to original name when empty)
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools '-c' syntax 'NEW:=INFO/OLD' renames a field
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command - keep only '##' meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window around each variant,
                            # clamped at 0, then merged into disjoint intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command (stderr appended to .err file, parsed below)
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands (split total threads
                # evenly across the parallel bcftools commands)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (chained to the merge so cleanup
                    # only happens when the merge succeeds)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages - scan stderr files for htslib-style
                    # '[W::' warnings and '[E::' errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates with bcftools.
Parameters
- threads: Number of threads to use
Returns
None.
    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotates with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
               "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If not exists, database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: None if annotation proceeds; False if the variants table or the
            sample list is empty; True after a successful annotation run
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # NOTE(review): a missing folder is only logged as an error here,
        # execution continues (download below may create it) — confirm intended.
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser jar command (fails hard if the jar is not found)
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate on an empty variants table)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header (mutated in place below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples (Exomiser requires at least one sample/genotype column)
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): computed and logged but not appended to the command
        # below — presumably injected via the bin command; confirm.
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not already present; can take a while)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always re-annotate, even if "Exomiser" already in header)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load analysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> found sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep digits only, e.g. "HP:0001156" -> "0001156")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> check
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and output format (if exists in param)
                    # TSV_VARIANT and VCF are always forced so results can be parsed below
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Splitted analysis dict
                # NOTE(review): shallow copy — both dicts share nested values
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                ########################

                ### Create list of samples to use and include into initial VCF file ###

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigree samples
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contain proband param -> split analysis/sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually uniq sample) -> single full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit status aborts the annotation)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Init result tsv file
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0: schema only, no rows)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid (variant coordinates, not annotations)
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enable
                        if header_column not in fields_to_avoid:

                            # Header info type
                            # object dtype + fully numeric -> Float; non-object dtype -> Integer
                            # NOTE(review): non-object float dtypes are tagged
                            # "Integer" here — confirm intended.
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info ('-' replaced by '_', '#' removed for valid INFO ids)
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                            '{header_info_name}=',
                                            table_parquet."{header_column}",
                                            ';'
                                        )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query (joins TSV rows to variants on chrom/pos/ref/alt)
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                                        CASE
                                            WHEN INFO NOT IN ('', '.')
                                            THEN INFO
                                            ELSE ''
                                        END,
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END,
                                        (
                                        SELECT
                                            concat(
                                                {",".join(sql_query_update_concat_fields)}
                                            )
                                        FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                        WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                          AND table_parquet.\"START\" = table_variants.\"POS\"
                                          AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                          AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        )
                                )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
This function annotates with Exomiser.
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionnary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) Default : None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotipic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string):
Sample name to construct "subject" section:
"subject":
{
"id": "
", "sex": "UNKNOWN_SEX" } Default: None - "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomise database release. If not exists, database release will be downloaded (take a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate the variants table with snpEff.

        Exports the current variants as a temporary VCF, runs the snpEff jar on it
        (downloading the snpEff database for the configured assembly if needed),
        merges any new INFO header definitions into the in-memory VCF header, and
        updates the variants table from the annotated VCF.

        :param threads: The number of threads to use (defaults to `self.get_threads()`)
        :return: None; the variants table and header are updated in place
        :raises ValueError: if the snpEff binary cannot be resolved, or if the
            snpEff run wrote "[E::"-prefixed lines to its error file
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used below — verify intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - snpEff bin command (java -jar invocation resolved by helper)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases folder (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options ("OUTPUT" placeholder in stats paths is replaced by the output file)
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is never injected into snpeff_command
        # below, and the log message says "Exomiser" — likely copy/paste; confirm.
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Re-annotation is always forced (existing ANN field is re-generated)
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (downloads assembly data if absent)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file (INFO stripped, no samples) for snpEff input
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file for snpEff stdout, plus an .err file for its stderr
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: collect all stderr lines, split htslib-style
            # "[W::" warnings from "[E::" errors
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any "[E::" line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header: merge new INFO definitions produced
            # by snpEff into the in-memory header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants table from the annotated VCF
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            # Unreachable while force_update_annotation is hard-coded True above
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates variants with snpEff.
Parameters
- threads: the number of threads to use
Returns
None; the variants table is updated in place.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar.

        Exports the current variants to a temporary VCF, then for each configured
        Annovar database runs a shell pipeline (table_annovar.pl → bcftools view →
        sed/awk cleanup → bcftools annotate → tabix), merges the per-database
        annotated VCFs with `bcftools merge`, folds new INFO header definitions
        into the in-memory header, and updates the variants table from the merged
        VCF. Temporary files are removed at the end.

        :param threads: number of threads to use (defaults to `self.get_threads()`)
        :return: None; the variants table and header are updated in place
        :raises ValueError: if the annovar/bcftools binaries or the databases
            folder cannot be resolved, or if a pipeline stage reported errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but cleanup below runs under
        # `if True:` regardless — verify intended behavior in debug mode
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl invocation resolved by helper)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (first entry is used if a list;
        # created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of database name -> {field: new_name}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Re-annotation is always forced (existing fields are re-generated)
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never used below
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO set to ".", no samples) for Annovar input
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One full pipeline run per configured Annovar database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar (multianno output + stderr capture)
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to annotate (and their renamed targets)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # field already in header and update not forced
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ("old new" pair appended for bcftools)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based by default, "g" gene-based for
                # refGene/ensGene families, "r" region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (extra options forwarded verbatim, except genebase)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: collect stderr, split htslib "[W::"/"WARNING"
                # warnings from "[E::"/"ERROR" errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge (original export + one file per database)
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header: merge new INFO definitions into
                # the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): gate is `if True:` — presumably should honor delete_tmp
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.
Parameters
- threads: number of threads to use
Returns
None; the variants table is updated in place.
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table from parquet (or attached) annotation databases.

        For each configured database, resolves the data and header files, maps the
        requested fields to database columns, registers new INFO definitions in the
        in-memory VCF header, and runs one DuckDB UPDATE per chromosome that
        concatenates the selected annotations into the INFO column. Supports
        "variants" databases (joined on CHROM/POS/REF/ALT) and "regions" databases
        (joined on positional overlap, values aggregated per POS).

        :param threads: number of threads to use for the annotation (defaults to
            `self.get_threads()`)
        :return: None; the variants table and header are updated in place
        :raises ValueError: if a database file or its header file cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used below — verify intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: union of "annotations" and "parquet" databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: mapping of database -> {field: new_name}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation (replace existing values) / Append (fill only
        # empty values)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total count, used in the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (dropped again at the end)
        added_columns = []

        # drop indexes (they would be invalidated by the UPDATEs)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan folders and add every discovered
            # database (filtered by formats/releases) with all fields
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields (default: whole INFO column)
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion (ATTACH if the database requires it)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields: register
                    # placeholder INFO definitions for columns absent from the
                    # database header
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field name -> database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate: field exists in database header, and is
                        # either new in the VCF header or update/append is forced
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (so the new value can
                                # be appended cleanly)
                                query = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = REGEXP_REPLACE(
                                            concat(table_variants.INFO,''),
                                            ';*{annotation_fields_new_name}=[^;]*',
                                            ''
                                        )
                                WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to defaults
                            # for missing metadata)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only fill values that are empty/"." in the
                            # current INFO
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                            CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                    THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                    ELSE ''
                            END
                            """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                            CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                    THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                    ELSE ''
                            END
                            """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Fast path: copy the database INFO column wholesale
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                        """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init (removal queries run first, before the per-chrom
                        # annotation queries)
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        query_dict = query_dict_remove

                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: overlap join,
                            # values aggregated per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from.\"#CHROM\" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact join on
                            # CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the concatenated
                            # annotations to INFO, inserting ';' only when both
                            # sides are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = 
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.') 
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x (large concat() trees
                        # exceed DuckDB's default expression depth)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It takes a VCF file and annotates it with a Parquet file
Parameters
- threads: number of threads to use for the annotation
Returns
the value of the variable "result"
6428 def annotation_splice(self, threads: int = None) -> None: 6429 """ 6430 This function annotate with snpEff 6431 6432 :param threads: The number of threads to use 6433 :return: the value of the variable "return_value". 6434 """ 6435 6436 # DEBUG 6437 log.debug("Start annotation with splice tools") 6438 6439 # Threads 6440 if not threads: 6441 threads = self.get_threads() 6442 log.debug("Threads: " + str(threads)) 6443 6444 # DEBUG 6445 delete_tmp = True 6446 if self.get_config().get("verbosity", "warning") in ["debug"]: 6447 delete_tmp = False 6448 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6449 6450 # Config 6451 config = self.get_config() 6452 log.debug("Config: " + str(config)) 6453 splice_config = config.get("tools", {}).get("splice", {}) 6454 if not splice_config: 6455 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6456 msg_err = "No Splice tool config" 6457 raise ValueError(msg_err) 6458 log.debug(f"splice_config: {splice_config}") 6459 6460 # Config - Folders - Databases 6461 databases_folders = ( 6462 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6463 ) 6464 log.debug("Databases annotations: " + str(databases_folders)) 6465 6466 # Splice docker image 6467 splice_docker_image = splice_config.get("docker").get("image") 6468 6469 # Pull splice image if it's not already there 6470 if not check_docker_image_exists(splice_docker_image): 6471 log.warning( 6472 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6473 ) 6474 try: 6475 command(f"docker pull {splice_config.get('docker').get('image')}") 6476 except subprocess.CalledProcessError: 6477 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6478 log.error(msg_err) 6479 raise ValueError(msg_err) 6480 6481 # Config - splice databases 6482 splice_databases = ( 6483 config.get("folders", {}) 6484 .get("databases", {}) 6485 .get("splice", DEFAULT_SPLICE_FOLDER) 6486 ) 6487 splice_databases = 
full_path(splice_databases) 6488 6489 # Param 6490 param = self.get_param() 6491 log.debug("Param: " + str(param)) 6492 6493 # Param 6494 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6495 log.debug("Options: " + str(options)) 6496 6497 # Data 6498 table_variants = self.get_table_variants() 6499 6500 # Check if not empty 6501 log.debug("Check if not empty") 6502 sql_query_chromosomes = ( 6503 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6504 ) 6505 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6506 log.info("VCF empty") 6507 return None 6508 6509 # Export in VCF 6510 log.debug("Create initial file to annotate") 6511 6512 # Create output folder / work folder 6513 if options.get("output_folder", ""): 6514 output_folder = options.get("output_folder", "") 6515 if not os.path.exists(output_folder): 6516 Path(output_folder).mkdir(parents=True, exist_ok=True) 6517 else: 6518 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6519 if not os.path.exists(output_folder): 6520 Path(output_folder).mkdir(parents=True, exist_ok=True) 6521 6522 if options.get("workdir", ""): 6523 workdir = options.get("workdir", "") 6524 else: 6525 workdir = "/work" 6526 6527 # Create tmp VCF file 6528 tmp_vcf = NamedTemporaryFile( 6529 prefix=self.get_prefix(), 6530 dir=output_folder, 6531 suffix=".vcf", 6532 delete=False, 6533 ) 6534 tmp_vcf_name = tmp_vcf.name 6535 6536 # VCF header 6537 header = self.get_header() 6538 6539 # Existing annotations 6540 for vcf_annotation in self.get_header().infos: 6541 6542 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6543 log.debug( 6544 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6545 ) 6546 6547 # Memory limit 6548 if config.get("memory", None): 6549 memory_limit = config.get("memory", "8G").upper() 6550 # upper() 6551 else: 6552 memory_limit = "8G" 6553 log.debug(f"memory_limit: {memory_limit}") 6554 6555 # 
Check number of variants to annotate 6556 where_clause_regex_spliceai = r"SpliceAI_\w+" 6557 where_clause_regex_spip = r"SPiP_\w+" 6558 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6559 df_list_of_variants_to_annotate = self.get_query_to_df( 6560 query=f""" SELECT * FROM variants {where_clause} """ 6561 ) 6562 if len(df_list_of_variants_to_annotate) == 0: 6563 log.warning( 6564 f"No variants to annotate with splice. Variants probably already annotated with splice" 6565 ) 6566 return None 6567 else: 6568 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6569 6570 # Export VCF file 6571 self.export_variant_vcf( 6572 vcf_file=tmp_vcf_name, 6573 remove_info=True, 6574 add_samples=True, 6575 index=False, 6576 where_clause=where_clause, 6577 ) 6578 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6579 if any(value for value in splice_config.values() if value is None): 6580 log.warning("At least one splice config parameter is empty") 6581 # exit annotation_splice 6582 return None 6583 6584 # Params in splice nf 6585 def check_values(dico: dict): 6586 """ 6587 Ensure parameters for NF splice pipeline 6588 """ 6589 for key, val in dico.items(): 6590 if key == "genome": 6591 if any( 6592 assemb in options.get("genome", {}) 6593 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6594 ): 6595 yield f"--{key} hg19" 6596 elif any( 6597 assemb in options.get("genome", {}) 6598 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6599 ): 6600 yield f"--{key} hg38" 6601 elif ( 6602 (isinstance(val, str) and val) 6603 or isinstance(val, int) 6604 or isinstance(val, bool) 6605 ): 6606 yield f"--{key} {val}" 6607 6608 # Genome 6609 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6610 options["genome"] = genome 6611 # NF params 6612 nf_params = [] 6613 # Add options 6614 if options: 6615 log.debug(options) 6616 nf_params 
= list(check_values(options)) 6617 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6618 else: 6619 log.debug("No NF params provided") 6620 # Add threads 6621 if "threads" not in options.keys(): 6622 nf_params.append(f"--threads {threads}") 6623 # Genome path 6624 genome_path = find_genome( 6625 config.get("folders", {}) 6626 .get("databases", {}) 6627 .get("genomes", DEFAULT_GENOME_FOLDER), 6628 file=f"{genome}.fa", 6629 ) 6630 # Add genome path 6631 if not genome_path: 6632 raise ValueError( 6633 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6634 ) 6635 else: 6636 log.debug(f"Genome: {genome_path}") 6637 nf_params.append(f"--genome_path {genome_path}") 6638 6639 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6640 """ 6641 Setting up updated databases for SPiP and SpliceAI 6642 """ 6643 6644 try: 6645 6646 # SpliceAI assembly transcriptome 6647 spliceai_assembly = os.path.join( 6648 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6649 options.get("genome"), 6650 "transcriptome", 6651 ) 6652 spip_assembly = options.get("genome") 6653 6654 spip = find( 6655 f"transcriptome_{spip_assembly}.RData", 6656 config.get("folders", {}).get("databases", {}).get("spip", {}), 6657 ) 6658 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6659 log.debug(f"SPiP annotations: {spip}") 6660 log.debug(f"SpliceAI annotations: {spliceai}") 6661 if spip and spliceai: 6662 return [ 6663 f"--spip_transcriptome {spip}", 6664 f"--spliceai_transcriptome {spliceai}", 6665 ] 6666 else: 6667 log.warning( 6668 "Can't find splice databases in configuration, use annotations file from image" 6669 ) 6670 except TypeError: 6671 log.warning( 6672 "Can't find splice databases in configuration, use annotations file from image" 6673 ) 6674 return [] 6675 6676 # Add options, check if transcriptome option have already beend provided 6677 if ( 6678 
"spip_transcriptome" not in nf_params 6679 and "spliceai_transcriptome" not in nf_params 6680 ): 6681 splice_reference = splice_annotations(options, config) 6682 if splice_reference: 6683 nf_params.extend(splice_reference) 6684 # nf_params.append(f"--output_folder {output_folder}") 6685 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6686 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6687 log.debug(cmd) 6688 splice_config["docker"]["command"] = cmd 6689 6690 # Ensure proxy is set 6691 proxy = [ 6692 f"-e {var}={os.getenv(var)}" 6693 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6694 if os.getenv(var) is not None 6695 ] 6696 docker_cmd = get_bin_command( 6697 tool="splice", 6698 bin_type="docker", 6699 config=config, 6700 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6701 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6702 ) 6703 # print(docker_cmd) 6704 # exit() 6705 # Docker debug 6706 # if splice_config.get("rm_container"): 6707 # rm_container = "--rm" 6708 # else: 6709 # rm_container = "" 6710 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6711 log.debug(docker_cmd) 6712 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6713 log.debug(res.stdout) 6714 if res.stderr: 6715 log.error(res.stderr) 6716 res.check_returncode() 6717 # Update variants 6718 log.info("Annotation - Updating...") 6719 # Test find output vcf 6720 log.debug( 6721 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6722 ) 6723 output_vcf = [] 6724 # Wrong folder to look in 6725 for 
files in os.listdir(os.path.dirname(tmp_vcf_name)): 6726 if ( 6727 files 6728 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6729 ): 6730 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6731 # log.debug(os.listdir(options.get("output_folder"))) 6732 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6733 if not output_vcf: 6734 log.debug( 6735 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6736 ) 6737 else: 6738 # Get new header from annotated vcf 6739 log.debug(f"Initial header: {len(header.infos)} fields") 6740 # Create new header with splice infos 6741 new_vcf = Variants(input=output_vcf[0]) 6742 new_vcf_header = new_vcf.get_header().infos 6743 for keys, infos in new_vcf_header.items(): 6744 if keys not in header.infos.keys(): 6745 header.infos[keys] = infos 6746 log.debug(f"New header: {len(header.infos)} fields") 6747 log.debug(f"Splice tmp output: {output_vcf[0]}") 6748 self.update_from_vcf(output_vcf[0]) 6749 6750 # Remove file 6751 remove_if_exists(output_vcf)
This function annotates with splice tools (SPiP and SpliceAI)
Parameters
- threads: The number of threads to use
Returns
None
6757 def get_config_default(self, name: str) -> dict: 6758 """ 6759 The function `get_config_default` returns a dictionary containing default configurations for 6760 various calculations and prioritizations. 6761 6762 :param name: The `get_config_default` function returns a dictionary containing default 6763 configurations for different calculations and prioritizations. The `name` parameter is used to 6764 specify which specific configuration to retrieve from the dictionary 6765 :type name: str 6766 :return: The function `get_config_default` returns a dictionary containing default configuration 6767 settings for different calculations and prioritizations. The specific configuration settings are 6768 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6769 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6770 returned. If there is no match, an empty dictionary is returned. 6771 """ 6772 6773 config_default = { 6774 "calculations": { 6775 "variant_chr_pos_alt_ref": { 6776 "type": "sql", 6777 "name": "variant_chr_pos_alt_ref", 6778 "description": "Create a variant ID with chromosome, position, alt and ref", 6779 "available": False, 6780 "output_column_name": "variant_chr_pos_alt_ref", 6781 "output_column_type": "String", 6782 "output_column_description": "variant ID with chromosome, position, alt and ref", 6783 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6784 "operation_info": True, 6785 }, 6786 "VARTYPE": { 6787 "type": "sql", 6788 "name": "VARTYPE", 6789 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6790 "available": True, 6791 "table": "variants", 6792 "output_column_name": "VARTYPE", 6793 "output_column_type": "String", 6794 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6795 "operation_query": """ 6796 CASE 6797 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6798 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6799 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6800 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6801 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6802 ELSE 'UNDEFINED' 6803 END 6804 """, 6805 "info_fields": ["SVTYPE"], 6806 "operation_info": True, 6807 }, 6808 "snpeff_hgvs": { 6809 "type": "python", 6810 "name": "snpeff_hgvs", 6811 "description": "HGVS nomenclatures from snpEff annotation", 6812 "available": True, 6813 "function_name": "calculation_extract_snpeff_hgvs", 6814 "function_params": ["snpeff_hgvs", "ANN"], 6815 }, 6816 "snpeff_ann_explode": { 6817 "type": "python", 6818 "name": "snpeff_ann_explode", 6819 "description": "Explode snpEff annotations with uniquify values", 6820 "available": True, 6821 "function_name": "calculation_snpeff_ann_explode", 6822 "function_params": [False, "fields", "snpeff_", "ANN"], 6823 }, 6824 "snpeff_ann_explode_uniquify": { 6825 "type": "python", 6826 "name": "snpeff_ann_explode_uniquify", 6827 "description": "Explode snpEff annotations", 6828 "available": True, 6829 "function_name": "calculation_snpeff_ann_explode", 6830 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6831 }, 6832 "snpeff_ann_explode_json": { 6833 "type": "python", 6834 "name": "snpeff_ann_explode_json", 6835 "description": "Explode snpEff annotations in JSON format", 6836 "available": True, 6837 "function_name": "calculation_snpeff_ann_explode", 6838 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6839 }, 6840 "NOMEN": { 6841 "type": "python", 6842 "name": "NOMEN", 6843 "description": "NOMEN information (e.g. 
NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6844 "available": True, 6845 "function_name": "calculation_extract_nomen", 6846 "function_params": [], 6847 }, 6848 "RENAME_INFO_FIELDS": { 6849 "type": "python", 6850 "name": "RENAME_INFO_FIELDS", 6851 "description": "Rename or remove INFO/tags", 6852 "available": True, 6853 "function_name": "calculation_rename_info_fields", 6854 "function_params": [], 6855 }, 6856 "FINDBYPIPELINE": { 6857 "type": "python", 6858 "name": "FINDBYPIPELINE", 6859 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6860 "available": True, 6861 "function_name": "calculation_find_by_pipeline", 6862 "function_params": ["findbypipeline"], 6863 }, 6864 "FINDBYSAMPLE": { 6865 "type": "python", 6866 "name": "FINDBYSAMPLE", 6867 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6868 "available": True, 6869 "function_name": "calculation_find_by_pipeline", 6870 "function_params": ["findbysample"], 6871 }, 6872 "GENOTYPECONCORDANCE": { 6873 "type": "python", 6874 "name": "GENOTYPECONCORDANCE", 6875 "description": "Concordance of genotype for multi caller VCF", 6876 "available": True, 6877 "function_name": "calculation_genotype_concordance", 6878 "function_params": [], 6879 }, 6880 "BARCODE": { 6881 "type": "python", 6882 "name": "BARCODE", 6883 "description": "BARCODE as VaRank tool", 6884 "available": True, 6885 "function_name": "calculation_barcode", 6886 "function_params": [], 6887 }, 6888 "BARCODEFAMILY": { 6889 "type": "python", 6890 "name": "BARCODEFAMILY", 6891 "description": "BARCODEFAMILY as VaRank tool", 6892 "available": True, 6893 "function_name": "calculation_barcode_family", 6894 "function_params": ["BCF"], 6895 }, 6896 "TRIO": { 6897 "type": "python", 6898 "name": "TRIO", 6899 "description": "Inheritance for a trio family", 6900 "available": True, 6901 "function_name": "calculation_trio", 6902 "function_params": [], 6903 }, 
6904 "VAF": { 6905 "type": "python", 6906 "name": "VAF", 6907 "description": "Variant Allele Frequency (VAF) harmonization", 6908 "available": True, 6909 "function_name": "calculation_vaf_normalization", 6910 "function_params": [], 6911 }, 6912 "VAF_stats": { 6913 "type": "python", 6914 "name": "VAF_stats", 6915 "description": "Variant Allele Frequency (VAF) statistics", 6916 "available": True, 6917 "function_name": "calculation_genotype_stats", 6918 "function_params": ["VAF"], 6919 }, 6920 "DP_stats": { 6921 "type": "python", 6922 "name": "DP_stats", 6923 "description": "Depth (DP) statistics", 6924 "available": True, 6925 "function_name": "calculation_genotype_stats", 6926 "function_params": ["DP"], 6927 }, 6928 "variant_id": { 6929 "type": "python", 6930 "name": "variant_id", 6931 "description": "Variant ID generated from variant position and type", 6932 "available": True, 6933 "function_name": "calculation_variant_id", 6934 "function_params": [], 6935 }, 6936 "transcripts_json": { 6937 "type": "python", 6938 "name": "transcripts_json", 6939 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6940 "available": True, 6941 "function_name": "calculation_transcripts_annotation", 6942 "function_params": ["transcripts_json", None], 6943 }, 6944 "transcripts_ann": { 6945 "type": "python", 6946 "name": "transcripts_ann", 6947 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6948 "available": True, 6949 "function_name": "calculation_transcripts_annotation", 6950 "function_params": [None, "transcripts_ann"], 6951 }, 6952 "transcripts_annotations": { 6953 "type": "python", 6954 "name": "transcripts_annotations", 6955 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6956 "available": True, 6957 "function_name": "calculation_transcripts_annotation", 6958 "function_params": [None, None], 6959 }, 6960 "transcripts_prioritization": { 6961 "type": 
"python", 6962 "name": "transcripts_prioritization", 6963 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6964 "available": True, 6965 "function_name": "calculation_transcripts_prioritization", 6966 "function_params": [], 6967 }, 6968 "transcripts_export": { 6969 "type": "python", 6970 "name": "transcripts_export", 6971 "description": "Export transcripts table/view as a file (using param.json)", 6972 "available": True, 6973 "function_name": "calculation_transcripts_export", 6974 "function_params": [], 6975 }, 6976 }, 6977 "prioritizations": { 6978 "default": { 6979 "ANN2": [ 6980 { 6981 "type": "contains", 6982 "value": "HIGH", 6983 "score": 5, 6984 "flag": "PASS", 6985 "comment": [ 6986 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6987 ], 6988 }, 6989 { 6990 "type": "contains", 6991 "value": "MODERATE", 6992 "score": 3, 6993 "flag": "PASS", 6994 "comment": [ 6995 "A non-disruptive variant that might change protein effectiveness" 6996 ], 6997 }, 6998 { 6999 "type": "contains", 7000 "value": "LOW", 7001 "score": 0, 7002 "flag": "FILTERED", 7003 "comment": [ 7004 "Assumed to be mostly harmless or unlikely to change protein behavior" 7005 ], 7006 }, 7007 { 7008 "type": "contains", 7009 "value": "MODIFIER", 7010 "score": 0, 7011 "flag": "FILTERED", 7012 "comment": [ 7013 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7014 ], 7015 }, 7016 ], 7017 } 7018 }, 7019 } 7020 7021 return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The
`get_config_default` function returns a dictionary containing default configurations for different calculations and prioritizations. The `name` parameter is used to specify which specific configuration to retrieve from the dictionary
Returns
The function
`get_config_default` returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input `name` parameter provided to the function. If the `name` parameter matches a key in the `config_default` dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.
7023 def get_config_json( 7024 self, name: str, config_dict: dict = {}, config_file: str = None 7025 ) -> dict: 7026 """ 7027 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7028 default values, a dictionary, and a file. 7029 7030 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7031 the name of the configuration. It is used to identify and retrieve the configuration settings 7032 for a specific component or module 7033 :type name: str 7034 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7035 dictionary that allows you to provide additional configuration settings or overrides. When you 7036 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7037 the key is the configuration setting you want to override or 7038 :type config_dict: dict 7039 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7040 specify the path to a configuration file that contains additional settings. If provided, the 7041 function will read the contents of this file and update the configuration dictionary with the 7042 values found in the file, overriding any existing values with the 7043 :type config_file: str 7044 :return: The function `get_config_json` returns a dictionary containing the configuration 7045 settings. 
7046 """ 7047 7048 # Create with default prioritizations 7049 config_default = self.get_config_default(name=name) 7050 configuration = config_default 7051 # log.debug(f"configuration={configuration}") 7052 7053 # Replace prioritizations from dict 7054 for config in config_dict: 7055 configuration[config] = config_dict[config] 7056 7057 # Replace prioritizations from file 7058 config_file = full_path(config_file) 7059 if config_file: 7060 if os.path.exists(config_file): 7061 with open(config_file) as config_file_content: 7062 config_file_dict = yaml.safe_load(config_file_content) 7063 for config in config_file_dict: 7064 configuration[config] = config_file_dict[config] 7065 else: 7066 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7067 log.error(msg_error) 7068 raise ValueError(msg_error) 7069 7070 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The
`name` parameter in the `get_config_json` function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module - config_dict: The
`config_dict` parameter in the `get_config_json` function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the `get_config_json` function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or - config_file: The
`config_file` parameter in the `get_config_json` function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns
The function
`get_config_json` returns a dictionary containing the configuration settings.
7072 def prioritization( 7073 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7074 ) -> bool: 7075 """ 7076 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7077 prioritizes variants based on configured profiles and criteria. 7078 7079 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7080 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7081 a table name is provided, the method will prioritize the variants in that specific table 7082 :type table: str 7083 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7084 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7085 provided, the code will use a default prefix value of "PZ" 7086 :type pz_prefix: str 7087 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7088 additional parameters specific to the prioritization process. These parameters can include 7089 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7090 configurations needed for the prioritization of variants in a V 7091 :type pz_param: dict 7092 :return: A boolean value (True) is being returned from the `prioritization` function. 
7093 """ 7094 7095 # Config 7096 config = self.get_config() 7097 7098 # Param 7099 param = self.get_param() 7100 7101 # Prioritization param 7102 if pz_param is not None: 7103 prioritization_param = pz_param 7104 else: 7105 prioritization_param = param.get("prioritization", {}) 7106 7107 # Configuration profiles 7108 prioritization_config_file = prioritization_param.get( 7109 "prioritization_config", None 7110 ) 7111 prioritization_config_file = full_path(prioritization_config_file) 7112 prioritizations_config = self.get_config_json( 7113 name="prioritizations", config_file=prioritization_config_file 7114 ) 7115 7116 # Prioritization prefix 7117 pz_prefix_default = "PZ" 7118 if pz_prefix is None: 7119 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7120 7121 # Prioritization options 7122 profiles = prioritization_param.get("profiles", []) 7123 if isinstance(profiles, str): 7124 profiles = profiles.split(",") 7125 pzfields = prioritization_param.get( 7126 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7127 ) 7128 if isinstance(pzfields, str): 7129 pzfields = pzfields.split(",") 7130 default_profile = prioritization_param.get("default_profile", None) 7131 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7132 prioritization_score_mode = prioritization_param.get( 7133 "prioritization_score_mode", "HOWARD" 7134 ) 7135 7136 # Quick Prioritizations 7137 prioritizations = param.get("prioritizations", None) 7138 if prioritizations: 7139 log.info("Quick Prioritization:") 7140 for profile in prioritizations.split(","): 7141 if profile not in profiles: 7142 profiles.append(profile) 7143 log.info(f" {profile}") 7144 7145 # If profile "ALL" provided, all profiles in the config profiles 7146 if "ALL" in profiles: 7147 profiles = list(prioritizations_config.keys()) 7148 7149 for profile in profiles: 7150 if prioritizations_config.get(profile, None): 7151 log.debug(f"Profile '{profile}' configured") 7152 else: 7153 msg_error = f"Profile 
'{profile}' NOT configured" 7154 log.error(msg_error) 7155 raise ValueError(msg_error) 7156 7157 if profiles: 7158 log.info(f"Prioritization... ") 7159 else: 7160 log.debug(f"No profile defined") 7161 return False 7162 7163 if not default_profile and len(profiles): 7164 default_profile = profiles[0] 7165 7166 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7167 log.debug("Profiles to check: " + str(list(profiles))) 7168 7169 # Variables 7170 if table is not None: 7171 table_variants = table 7172 else: 7173 table_variants = self.get_table_variants(clause="update") 7174 log.debug(f"Table to prioritize: {table_variants}") 7175 7176 # Added columns 7177 added_columns = [] 7178 7179 # Create list of PZfields 7180 # List of PZFields 7181 list_of_pzfields_original = pzfields + [ 7182 pzfield + pzfields_sep + profile 7183 for pzfield in pzfields 7184 for profile in profiles 7185 ] 7186 list_of_pzfields = [] 7187 log.debug(f"{list_of_pzfields_original}") 7188 7189 # Remove existing PZfields to use if exists 7190 for pzfield in list_of_pzfields_original: 7191 if self.get_header().infos.get(pzfield, None) is None: 7192 list_of_pzfields.append(pzfield) 7193 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7194 else: 7195 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7196 7197 if list_of_pzfields: 7198 7199 # Explode Infos prefix 7200 explode_infos_prefix = self.get_explode_infos_prefix() 7201 7202 # PZfields tags description 7203 PZfields_INFOS = { 7204 f"{pz_prefix}Tags": { 7205 "ID": f"{pz_prefix}Tags", 7206 "Number": ".", 7207 "Type": "String", 7208 "Description": "Variant tags based on annotation criteria", 7209 }, 7210 f"{pz_prefix}Score": { 7211 "ID": f"{pz_prefix}Score", 7212 "Number": 1, 7213 "Type": "Integer", 7214 "Description": "Variant score based on annotation criteria", 7215 }, 7216 f"{pz_prefix}Flag": { 7217 "ID": f"{pz_prefix}Flag", 7218 "Number": 1, 7219 "Type": "String", 7220 
"Description": "Variant flag based on annotation criteria", 7221 }, 7222 f"{pz_prefix}Comment": { 7223 "ID": f"{pz_prefix}Comment", 7224 "Number": ".", 7225 "Type": "String", 7226 "Description": "Variant comment based on annotation criteria", 7227 }, 7228 f"{pz_prefix}Infos": { 7229 "ID": f"{pz_prefix}Infos", 7230 "Number": ".", 7231 "Type": "String", 7232 "Description": "Variant infos based on annotation criteria", 7233 }, 7234 f"{pz_prefix}Class": { 7235 "ID": f"{pz_prefix}Class", 7236 "Number": ".", 7237 "Type": "String", 7238 "Description": "Variant class based on annotation criteria", 7239 }, 7240 } 7241 7242 # Create INFO fields if not exist 7243 for field in PZfields_INFOS: 7244 field_ID = PZfields_INFOS[field]["ID"] 7245 field_description = PZfields_INFOS[field]["Description"] 7246 if field_ID not in self.get_header().infos and field_ID in pzfields: 7247 field_description = ( 7248 PZfields_INFOS[field]["Description"] 7249 + f", profile {default_profile}" 7250 ) 7251 self.get_header().infos[field_ID] = vcf.parser._Info( 7252 field_ID, 7253 PZfields_INFOS[field]["Number"], 7254 PZfields_INFOS[field]["Type"], 7255 field_description, 7256 "unknown", 7257 "unknown", 7258 code_type_map[PZfields_INFOS[field]["Type"]], 7259 ) 7260 7261 # Create INFO fields if not exist for each profile 7262 for profile in prioritizations_config: 7263 if profile in profiles or profiles == []: 7264 for field in PZfields_INFOS: 7265 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7266 field_description = ( 7267 PZfields_INFOS[field]["Description"] 7268 + f", profile {profile}" 7269 ) 7270 if ( 7271 field_ID not in self.get_header().infos 7272 and field in pzfields 7273 ): 7274 self.get_header().infos[field_ID] = vcf.parser._Info( 7275 field_ID, 7276 PZfields_INFOS[field]["Number"], 7277 PZfields_INFOS[field]["Type"], 7278 field_description, 7279 "unknown", 7280 "unknown", 7281 code_type_map[PZfields_INFOS[field]["Type"]], 7282 ) 7283 7284 # Header 7285 for pzfield in 
list_of_pzfields: 7286 if re.match(f"{pz_prefix}Score.*", pzfield): 7287 added_column = self.add_column( 7288 table_name=table_variants, 7289 column_name=pzfield, 7290 column_type="INTEGER", 7291 default_value="0", 7292 ) 7293 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7294 added_column = self.add_column( 7295 table_name=table_variants, 7296 column_name=pzfield, 7297 column_type="BOOLEAN", 7298 default_value="1", 7299 ) 7300 elif re.match(f"{pz_prefix}Class.*", pzfield): 7301 added_column = self.add_column( 7302 table_name=table_variants, 7303 column_name=pzfield, 7304 column_type="VARCHAR[]", 7305 default_value="null", 7306 ) 7307 else: 7308 added_column = self.add_column( 7309 table_name=table_variants, 7310 column_name=pzfield, 7311 column_type="STRING", 7312 default_value="''", 7313 ) 7314 added_columns.append(added_column) 7315 7316 # Profiles 7317 if profiles: 7318 7319 # foreach profile in configuration file 7320 for profile in prioritizations_config: 7321 7322 # If profile is asked in param, or ALL are asked (empty profile []) 7323 if profile in profiles or profiles == []: 7324 log.info(f"Profile '{profile}'") 7325 7326 sql_set_info_option = "" 7327 7328 sql_set_info = [] 7329 7330 # PZ fields set 7331 7332 # PZScore 7333 if ( 7334 f"{pz_prefix}Score{pzfields_sep}{profile}" 7335 in list_of_pzfields 7336 ): 7337 sql_set_info.append( 7338 f""" 7339 concat( 7340 '{pz_prefix}Score{pzfields_sep}{profile}=', 7341 {pz_prefix}Score{pzfields_sep}{profile} 7342 ) 7343 """ 7344 ) 7345 if ( 7346 profile == default_profile 7347 and f"{pz_prefix}Score" in list_of_pzfields 7348 ): 7349 sql_set_info.append( 7350 f""" 7351 concat( 7352 '{pz_prefix}Score=', 7353 {pz_prefix}Score{pzfields_sep}{profile} 7354 ) 7355 """ 7356 ) 7357 7358 # PZFlag 7359 if ( 7360 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7361 in list_of_pzfields 7362 ): 7363 sql_set_info.append( 7364 f""" 7365 concat( 7366 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7367 CASE 7368 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 7369 THEN 'PASS' 7370 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7371 THEN 'FILTERED' 7372 END 7373 ) 7374 """ 7375 ) 7376 if ( 7377 profile == default_profile 7378 and f"{pz_prefix}Flag" in list_of_pzfields 7379 ): 7380 sql_set_info.append( 7381 f""" 7382 concat( 7383 '{pz_prefix}Flag=', 7384 CASE 7385 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7386 THEN 'PASS' 7387 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7388 THEN 'FILTERED' 7389 END 7390 ) 7391 """ 7392 ) 7393 7394 # PZClass 7395 if ( 7396 f"{pz_prefix}Class{pzfields_sep}{profile}" 7397 in list_of_pzfields 7398 ): 7399 sql_set_info.append( 7400 f""" 7401 concat( 7402 '{pz_prefix}Class{pzfields_sep}{profile}=', 7403 CASE 7404 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7405 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7406 ELSE '.' 7407 END 7408 ) 7409 7410 """ 7411 ) 7412 if ( 7413 profile == default_profile 7414 and f"{pz_prefix}Class" in list_of_pzfields 7415 ): 7416 sql_set_info.append( 7417 f""" 7418 concat( 7419 '{pz_prefix}Class=', 7420 CASE 7421 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7422 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7423 ELSE '.' 
7424 END 7425 ) 7426 """ 7427 ) 7428 7429 # PZComment 7430 if ( 7431 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7432 in list_of_pzfields 7433 ): 7434 sql_set_info.append( 7435 f""" 7436 CASE 7437 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7438 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7439 ELSE '' 7440 END 7441 """ 7442 ) 7443 if ( 7444 profile == default_profile 7445 and f"{pz_prefix}Comment" in list_of_pzfields 7446 ): 7447 sql_set_info.append( 7448 f""" 7449 CASE 7450 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7451 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7452 ELSE '' 7453 END 7454 """ 7455 ) 7456 7457 # PZInfos 7458 if ( 7459 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7460 in list_of_pzfields 7461 ): 7462 sql_set_info.append( 7463 f""" 7464 CASE 7465 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7466 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7467 ELSE '' 7468 END 7469 """ 7470 ) 7471 if ( 7472 profile == default_profile 7473 and f"{pz_prefix}Infos" in list_of_pzfields 7474 ): 7475 sql_set_info.append( 7476 f""" 7477 CASE 7478 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7479 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7480 ELSE '' 7481 END 7482 """ 7483 ) 7484 7485 # Merge PZfields 7486 sql_set_info_option = "" 7487 sql_set_sep = "" 7488 for sql_set in sql_set_info: 7489 if sql_set_sep: 7490 sql_set_info_option += f""" 7491 , concat('{sql_set_sep}', {sql_set}) 7492 """ 7493 else: 7494 sql_set_info_option += f""" 7495 , {sql_set} 7496 """ 7497 sql_set_sep = ";" 7498 7499 sql_queries = [] 7500 for annotation in prioritizations_config[profile]: 7501 7502 # skip special sections 7503 if annotation.startswith("_"): 7504 continue 7505 7506 # For each criterions 7507 for criterion in prioritizations_config[profile][ 7508 annotation 
7509 ]: 7510 7511 # Criterion mode 7512 criterion_mode = None 7513 if np.any( 7514 np.isin(list(criterion.keys()), ["type", "value"]) 7515 ): 7516 criterion_mode = "operation" 7517 elif np.any( 7518 np.isin(list(criterion.keys()), ["sql", "fields"]) 7519 ): 7520 criterion_mode = "sql" 7521 log.debug(f"Criterion Mode: {criterion_mode}") 7522 7523 # Criterion parameters 7524 criterion_type = criterion.get("type", None) 7525 criterion_value = criterion.get("value", None) 7526 criterion_sql = criterion.get("sql", None) 7527 criterion_fields = criterion.get("fields", None) 7528 criterion_score = criterion.get("score", 0) 7529 criterion_flag = criterion.get("flag", "PASS") 7530 criterion_class = criterion.get("class", None) 7531 criterion_flag_bool = criterion_flag == "PASS" 7532 criterion_comment = ( 7533 ", ".join(criterion.get("comment", [])) 7534 .replace("'", "''") 7535 .replace(";", ",") 7536 .replace("\t", " ") 7537 ) 7538 criterion_infos = ( 7539 str(criterion) 7540 .replace("'", "''") 7541 .replace(";", ",") 7542 .replace("\t", " ") 7543 ) 7544 7545 # SQL 7546 if criterion_sql is not None and isinstance( 7547 criterion_sql, list 7548 ): 7549 criterion_sql = " ".join(criterion_sql) 7550 7551 # Fields and explode 7552 if criterion_fields is None: 7553 criterion_fields = [annotation] 7554 if not isinstance(criterion_fields, list): 7555 criterion_fields = str(criterion_fields).split(",") 7556 7557 # Class 7558 if criterion_class is not None and not isinstance( 7559 criterion_class, list 7560 ): 7561 criterion_class = str(criterion_class).split(",") 7562 7563 for annotation_field in criterion_fields: 7564 7565 # Explode specific annotation 7566 log.debug( 7567 f"Explode annotation '{annotation_field}'" 7568 ) 7569 added_columns += self.explode_infos( 7570 prefix=explode_infos_prefix, 7571 fields=[annotation_field], 7572 table=table_variants, 7573 ) 7574 extra_infos = self.get_extra_infos( 7575 table=table_variants 7576 ) 7577 7578 # Check if annotation field is 
present 7579 if ( 7580 f"{explode_infos_prefix}{annotation_field}" 7581 not in extra_infos 7582 ): 7583 msq_err = f"Annotation '{annotation_field}' not in data" 7584 log.error(msq_err) 7585 raise ValueError(msq_err) 7586 else: 7587 log.debug( 7588 f"Annotation '{annotation_field}' in data" 7589 ) 7590 7591 sql_set = [] 7592 sql_set_info = [] 7593 7594 # PZ fields set 7595 7596 # PZScore 7597 if ( 7598 f"{pz_prefix}Score{pzfields_sep}{profile}" 7599 in list_of_pzfields 7600 ): 7601 # VaRank prioritization score mode 7602 if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]: 7603 sql_set.append( 7604 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7605 ) 7606 # default HOWARD prioritization score mode 7607 else: 7608 sql_set.append( 7609 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7610 ) 7611 7612 # PZFlag 7613 if ( 7614 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7615 in list_of_pzfields 7616 ): 7617 sql_set.append( 7618 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7619 ) 7620 7621 # PZClass 7622 if ( 7623 f"{pz_prefix}Class{pzfields_sep}{profile}" 7624 in list_of_pzfields 7625 and criterion_class is not None 7626 ): 7627 sql_set.append( 7628 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7629 ) 7630 7631 # PZComment 7632 if ( 7633 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7634 in list_of_pzfields 7635 ): 7636 sql_set.append( 7637 f""" 7638 {pz_prefix}Comment{pzfields_sep}{profile} = 7639 concat( 7640 {pz_prefix}Comment{pzfields_sep}{profile}, 7641 CASE 7642 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7643 THEN ', ' 7644 ELSE '' 7645 END, 7646 '{criterion_comment}' 7647 ) 7648 """ 
7649 ) 7650 7651 # PZInfos 7652 if ( 7653 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7654 in list_of_pzfields 7655 ): 7656 sql_set.append( 7657 f""" 7658 {pz_prefix}Infos{pzfields_sep}{profile} = 7659 concat( 7660 {pz_prefix}Infos{pzfields_sep}{profile}, 7661 '{criterion_infos}' 7662 ) 7663 """ 7664 ) 7665 sql_set_option = ",".join(sql_set) 7666 7667 # Criterion and comparison 7668 if sql_set_option: 7669 7670 if criterion_mode in ["operation"]: 7671 7672 try: 7673 float(criterion_value) 7674 sql_update = f""" 7675 UPDATE {table_variants} 7676 SET {sql_set_option} 7677 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7678 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7679 """ 7680 except: 7681 contains_option = "" 7682 if criterion_type == "contains": 7683 contains_option = ".*" 7684 sql_update = f""" 7685 UPDATE {table_variants} 7686 SET {sql_set_option} 7687 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7688 """ 7689 sql_queries.append(sql_update) 7690 7691 elif criterion_mode in ["sql"]: 7692 7693 sql_update = f""" 7694 UPDATE {table_variants} 7695 SET {sql_set_option} 7696 WHERE {criterion_sql} 7697 """ 7698 sql_queries.append(sql_update) 7699 7700 else: 7701 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7702 log.error(msg_err) 7703 raise ValueError(msg_err) 7704 7705 else: 7706 log.warning( 7707 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7708 ) 7709 7710 # PZTags 7711 if ( 7712 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7713 in list_of_pzfields 7714 ): 7715 7716 # Create PZFalgs value 7717 pztags_value = "" 7718 pztags_sep_default = "," 7719 pztags_sep = "" 7720 for pzfield in pzfields: 7721 if pzfield not in [f"{pz_prefix}Tags"]: 7722 if ( 7723 f"{pzfield}{pzfields_sep}{profile}" 7724 in list_of_pzfields 7725 ): 7726 if pzfield in [f"{pz_prefix}Flag"]: 7727 
pztags_value += f"""{pztags_sep}{pzfield}#', 7728 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7729 THEN 'PASS' 7730 ELSE 'FILTERED' 7731 END, '""" 7732 elif pzfield in [f"{pz_prefix}Class"]: 7733 pztags_value += f"""{pztags_sep}{pzfield}#', 7734 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7735 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7736 ELSE '.' 7737 END, '""" 7738 else: 7739 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7740 pztags_sep = pztags_sep_default 7741 7742 # Add Query update for PZFlags 7743 sql_update_pztags = f""" 7744 UPDATE {table_variants} 7745 SET INFO = concat( 7746 INFO, 7747 CASE WHEN INFO NOT in ('','.') 7748 THEN ';' 7749 ELSE '' 7750 END, 7751 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7752 ) 7753 """ 7754 sql_queries.append(sql_update_pztags) 7755 7756 # Add Query update for PZFlags for default 7757 if profile == default_profile: 7758 sql_update_pztags_default = f""" 7759 UPDATE {table_variants} 7760 SET INFO = concat( 7761 INFO, 7762 ';', 7763 '{pz_prefix}Tags={pztags_value}' 7764 ) 7765 """ 7766 sql_queries.append(sql_update_pztags_default) 7767 7768 log.info(f"""Profile '{profile}' - Prioritization... """) 7769 7770 if sql_queries: 7771 7772 for sql_query in sql_queries: 7773 log.debug( 7774 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7775 ) 7776 self.conn.execute(sql_query) 7777 7778 log.info(f"""Profile '{profile}' - Update... 
""") 7779 sql_query_update = f""" 7780 UPDATE {table_variants} 7781 SET INFO = 7782 concat( 7783 CASE 7784 WHEN INFO NOT IN ('','.') 7785 THEN concat(INFO, ';') 7786 ELSE '' 7787 END 7788 {sql_set_info_option} 7789 ) 7790 """ 7791 self.conn.execute(sql_query_update) 7792 7793 else: 7794 7795 log.warning(f"No profiles in parameters") 7796 7797 # Remove added columns 7798 for added_column in added_columns: 7799 self.drop_column(column=added_column) 7800 7801 # Explode INFOS fields into table fields 7802 if self.get_explode_infos(): 7803 self.explode_infos( 7804 prefix=self.get_explode_infos_prefix(), 7805 fields=self.get_explode_infos_fields(), 7806 force=True, 7807 ) 7808 7809 return True
The `prioritization` function processes the variants table, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: The `table` parameter in the `prioritization` function is used to specify the name of the table (typically the variants table) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table.
- pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ".
- pz_param: The `pz_param` parameter in the `prioritization` method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file.
Returns
A boolean value (`True`) is returned from the `prioritization` function.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Pipeline overview (as implemented below):
        1. Load refSeq (transcripts) and optionally refSeqLink (transcript->protein) tables
           into the DuckDB connection, restricted to positions present in the variants table.
        2. Load the genome FASTA with pyfaidx.
        3. Partition the variants dataframe with Dask (one partition per thread) and compute,
           per variant, a comma-separated list of HGVS names (one per overlapping transcript).
        4. Write the annotated dataframe to a temporary Parquet file and use it to update a
           temporary column, then fold that column into the INFO field as 'hgvs=...'.
        5. Register the 'hgvs' INFO field in the VCF header and drop the temporary columns.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function applied to each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Closes over method locals: `polars_conn`, `transcripts`, `genome`, and the
            HGVS formatting options (`use_exon`, `use_gene`, `use_protein`, `add_protein`,
            `full_format`, `use_version`, `codon_type`) defined later in `annotation_hgvs`.

            :param row: A dictionary-like object with keys "CHROM", "POS", "REF", "ALT"
            :return: a comma-joined string of HGVS names for the row (may be empty).
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts overlapping this position
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon (only computed when requested; None otherwise)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession lookup — only needed when a protein-level
                # name will be emitted
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name for this transcript
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When add_protein is set (and the protein name was not already
                # included above), emit an additional protein-level HGVS name
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion (eager SQL context over registered global dataframes)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # NOTE(review): same config key as above but with "" default — used as a
        # direct genome path candidate before falling back to the folder search
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: "hgvs_options" is a comma-separated "key=value" string that
        # is parsed into the param["hgvs"] dict ("key" alone means True)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently when not configured
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink files can be overridden at the param level
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: direct path first, then folder+assembly search
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (alphabetic REF and ALT; excludes
        # symbolic alleles and breakends)
        query_variants = f"""
        SELECT "#CHROM" AS CHROM, POS, REF, ALT
        FROM {table_variants}
        WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked so they can be dropped at the end)
        added_columns = []

        # Add temporary hgvs column in variants table (random suffix to avoid
        # colliding with an existing column)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe, restricted to transcripts that
        # overlap at least one variant position
        refseq_query = f"""
        SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
        FROM {refseq_table}
        JOIN df_variants ON (
            {refseq_table}.chrom = df_variants.CHROM
            AND {refseq_table}.txStart<=df_variants.POS
            AND {refseq_table}.txEnd>=df_variants.POS
        )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
            SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
            FROM {refseqlink_table}
            JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
            WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
            COPY (
                SELECT {refseq_table}.*
                FROM {refseq_table}
                JOIN df_variants ON (
                    {refseq_table}.chrom=df_variants.CHROM
                    AND {refseq_table}.txStart<=df_variants.POS
                    AND {refseq_table}.txEnd>=df_variants.POS
                )
            )
            TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion (re-created so newly defined dataframes are registered)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet file, matched on the full
            # variant key (CHROM, POS, REF, ALT)
            # NOTE(review): `x NOT NULL` is DuckDB shorthand for `x IS NOT NULL`
            # — confirm against the DuckDB version in use
            update_variant_query = f"""
            UPDATE {table_variants}
            SET "{hgvs_column_name}"=df."{hgvs_column_name}"
            FROM read_parquet('{df_parquet}') as df
            WHERE variants."#CHROM" = df.CHROM
            AND variants.POS = df.POS
            AND variants.REF = df.REF
            AND variants.ALT = df.ALT
            AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' with a ';' separator unless
        # INFO is empty or '.'
        sql_query_update = f"""
        UPDATE {table_variants}
        SET INFO =
            concat(
                CASE
                    WHEN INFO NOT IN ('','.')
                    THEN concat(INFO, ';')
                    ELSE ''
                END,
                'hgvs=',
                {hgvs_column_name}
            )
        WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add 'hgvs' INFO field to the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (temporary hgvs column and any exploded columns)
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it defaults to the number of threads obtained from the `get_threads()` method.
8202 def get_operations_help( 8203 self, operations_config_dict: dict = {}, operations_config_file: str = None 8204 ) -> list: 8205 8206 # Init 8207 operations_help = [] 8208 8209 # operations 8210 operations = self.get_config_json( 8211 name="calculations", 8212 config_dict=operations_config_dict, 8213 config_file=operations_config_file, 8214 ) 8215 for op in operations: 8216 op_name = operations[op].get("name", op).upper() 8217 op_description = operations[op].get("description", op_name) 8218 op_available = operations[op].get("available", False) 8219 if op_available: 8220 operations_help.append(f" {op_name}: {op_description}") 8221 8222 # Sort operations 8223 operations_help.sort() 8224 8225 # insert header 8226 operations_help.insert(0, "Available calculation operations:") 8227 8228 # Return 8229 return operations_help
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function.

        Operation resolution order:
        1. `param["calculation"]["calculations"]` overrides the `operations` argument.
        2. `param["calculations"]` ("quick" comma-separated string) is merged in
           front, preserving the user-given order.
        Each resolved operation name is upper-cased and dispatched to
        `calculation_process_function` (type "python") or
        `calculation_process_sql` (type "sql", the default).

        :param operations: operations to run, keyed by operation name
        :param operations_config_dict: optional operations configuration as a dict
        :param operations_config_file: optional path to an operations configuration file
        :raises ValueError: if an operation name or an operation type is not
        available in the operations configuration

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle" : null
                }
            }
        """

        # Param
        param = self.get_param()

        # Check operations config file (param-level path used when none given)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys — operation lookup is case-insensitive
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (overrides the argument when present)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add operations given as a comma-separated string
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")

            # Create tmp operations (to keep operation order: quick ones first)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    # Carry over any options already defined for this operation
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (after the quick ones)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (final fallback to param)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        # Dispatch on operation type; "sql" is the default
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It takes a list of operations, and for each operation, it checks whether it's a python or an sql operation, and then calls the appropriate function.
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
8359 def calculation_process_sql( 8360 self, operation: dict, operation_name: str = "unknown" 8361 ) -> None: 8362 """ 8363 The `calculation_process_sql` function takes in a mathematical operation as a string and 8364 performs the operation, updating the specified table with the result. 8365 8366 :param operation: The `operation` parameter is a dictionary that contains information about the 8367 mathematical operation to be performed. It includes the following keys: 8368 :type operation: dict 8369 :param operation_name: The `operation_name` parameter is a string that represents the name of 8370 the mathematical operation being performed. It is used for logging and error handling purposes, 8371 defaults to unknown 8372 :type operation_name: str (optional) 8373 """ 8374 8375 # Operation infos 8376 operation_name = operation.get("name", "unknown") 8377 log.debug(f"process SQL {operation_name}") 8378 output_column_name = operation.get("output_column_name", operation_name) 8379 output_column_type = operation.get("output_column_type", "String") 8380 prefix = operation.get("explode_infos_prefix", "") 8381 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8382 output_column_description = operation.get( 8383 "output_column_description", f"{operation_name} operation" 8384 ) 8385 operation_query = operation.get("operation_query", None) 8386 if isinstance(operation_query, list): 8387 operation_query = " ".join(operation_query) 8388 operation_info_fields = operation.get("info_fields", []) 8389 operation_info_fields_check = operation.get("info_fields_check", False) 8390 operation_info = operation.get("operation_info", True) 8391 operation_table = operation.get( 8392 "table", self.get_table_variants(clause="alter") 8393 ) 8394 8395 # table variants 8396 if operation_table: 8397 table_variants = operation_table 8398 else: 8399 table_variants = self.get_table_variants(clause="alter") 8400 8401 if operation_query: 8402 8403 # Info fields check 8404 
operation_info_fields_check_result = True 8405 if operation_info_fields_check: 8406 header_infos = self.get_header().infos 8407 for info_field in operation_info_fields: 8408 operation_info_fields_check_result = ( 8409 operation_info_fields_check_result 8410 and info_field in header_infos 8411 ) 8412 8413 # If info fields available 8414 if operation_info_fields_check_result: 8415 8416 # Added_columns 8417 added_columns = [] 8418 8419 # Create VCF header field 8420 vcf_reader = self.get_header() 8421 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8422 output_column_name, 8423 ".", 8424 output_column_type, 8425 output_column_description, 8426 "howard calculation", 8427 "0", 8428 self.code_type_map.get(output_column_type), 8429 ) 8430 8431 # Explode infos if needed 8432 log.debug(f"calculation_process_sql prefix {prefix}") 8433 added_columns += self.explode_infos( 8434 prefix=prefix, 8435 fields=[output_column_name] + operation_info_fields, 8436 force=False, 8437 table=table_variants, 8438 ) 8439 8440 # Create column 8441 added_column = self.add_column( 8442 table_name=table_variants, 8443 column_name=prefix + output_column_name, 8444 column_type=output_column_type_sql, 8445 default_value="null", 8446 ) 8447 added_columns.append(added_column) 8448 8449 # Operation calculation 8450 try: 8451 8452 # Query to update calculation column 8453 sql_update = f""" 8454 UPDATE {table_variants} 8455 SET "{prefix}{output_column_name}" = ({operation_query}) 8456 """ 8457 self.conn.execute(sql_update) 8458 8459 # Add to INFO 8460 if operation_info: 8461 sql_update_info = f""" 8462 UPDATE {table_variants} 8463 SET "INFO" = 8464 concat( 8465 CASE 8466 WHEN "INFO" IS NOT NULL 8467 THEN concat("INFO", ';') 8468 ELSE '' 8469 END, 8470 '{output_column_name}=', 8471 "{prefix}{output_column_name}" 8472 ) 8473 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8474 """ 8475 self.conn.execute(sql_update_info) 8476 8477 except: 8478 
log.error( 8479 f"Operations config: Calculation '{operation_name}' query failed" 8480 ) 8481 raise ValueError( 8482 f"Operations config: Calculation '{operation_name}' query failed" 8483 ) 8484 8485 # Remove added columns 8486 for added_column in added_columns: 8487 log.debug(f"added_column: {added_column}") 8488 self.drop_column(column=added_column) 8489 8490 else: 8491 log.error( 8492 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8493 ) 8494 raise ValueError( 8495 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8496 ) 8497 8498 else: 8499 log.error( 8500 f"Operations config: Calculation '{operation_name}' query NOT defined" 8501 ) 8502 raise ValueError( 8503 f"Operations config: Calculation '{operation_name}' query NOT defined" 8504 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
8506 def calculation_process_function( 8507 self, operation: dict, operation_name: str = "unknown" 8508 ) -> None: 8509 """ 8510 The `calculation_process_function` takes in an operation dictionary and performs the specified 8511 function with the given parameters. 8512 8513 :param operation: The `operation` parameter is a dictionary that contains information about the 8514 operation to be performed. It has the following keys: 8515 :type operation: dict 8516 :param operation_name: The `operation_name` parameter is a string that represents the name of 8517 the operation being performed. It is used for logging purposes, defaults to unknown 8518 :type operation_name: str (optional) 8519 """ 8520 8521 operation_name = operation["name"] 8522 log.debug(f"process Python {operation_name}") 8523 function_name = operation["function_name"] 8524 function_params = operation["function_params"] 8525 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
8527 def calculation_variant_id(self) -> None: 8528 """ 8529 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8530 updates the INFO field of a variants table with the variant ID. 8531 """ 8532 8533 # variant_id annotation field 8534 variant_id_tag = self.get_variant_id_column() 8535 added_columns = [variant_id_tag] 8536 8537 # variant_id hgvs tags" 8538 vcf_infos_tags = { 8539 variant_id_tag: "howard variant ID annotation", 8540 } 8541 8542 # Variants table 8543 table_variants = self.get_table_variants() 8544 8545 # Header 8546 vcf_reader = self.get_header() 8547 8548 # Add variant_id to header 8549 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8550 variant_id_tag, 8551 ".", 8552 "String", 8553 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8554 "howard calculation", 8555 "0", 8556 self.code_type_map.get("String"), 8557 ) 8558 8559 # Update 8560 sql_update = f""" 8561 UPDATE {table_variants} 8562 SET "INFO" = 8563 concat( 8564 CASE 8565 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8566 THEN '' 8567 ELSE concat("INFO", ';') 8568 END, 8569 '{variant_id_tag}=', 8570 "{variant_id_tag}" 8571 ) 8572 """ 8573 self.conn.execute(sql_update) 8574 8575 # Remove added columns 8576 for added_column in added_columns: 8577 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
8579 def calculation_extract_snpeff_hgvs( 8580 self, 8581 snpeff_hgvs: str = "snpeff_hgvs", 8582 snpeff_field: str = "ANN", 8583 ) -> None: 8584 """ 8585 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8586 annotation field in a VCF file and adds them as a new column in the variants table. 8587 8588 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8589 function is used to specify the name of the column that will store the HGVS nomenclatures 8590 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8591 snpeff_hgvs 8592 :type snpeff_hgvs: str (optional) 8593 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8594 function represents the field in the VCF file that contains SnpEff annotations. This field is 8595 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8596 to ANN 8597 :type snpeff_field: str (optional) 8598 """ 8599 8600 # Snpeff hgvs tags 8601 vcf_infos_tags = { 8602 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8603 } 8604 8605 # Prefix 8606 prefix = self.get_explode_infos_prefix() 8607 if prefix: 8608 prefix = "INFO/" 8609 8610 # snpEff fields 8611 speff_ann_infos = prefix + snpeff_field 8612 speff_hgvs_infos = prefix + snpeff_hgvs 8613 8614 # Variants table 8615 table_variants = self.get_table_variants() 8616 8617 # Header 8618 vcf_reader = self.get_header() 8619 8620 # Add columns 8621 added_columns = [] 8622 8623 # Explode HGVS field in column 8624 added_columns += self.explode_infos(fields=[snpeff_field]) 8625 8626 if snpeff_field in vcf_reader.infos: 8627 8628 log.debug(vcf_reader.infos[snpeff_field]) 8629 8630 # Extract ANN header 8631 ann_description = vcf_reader.infos[snpeff_field].desc 8632 pattern = r"'(.+?)'" 8633 match = re.search(pattern, ann_description) 8634 if match: 8635 ann_header_match = match.group(1).split(" | ") 
8636 ann_header_desc = {} 8637 for i in range(len(ann_header_match)): 8638 ann_header_info = "".join( 8639 char for char in ann_header_match[i] if char.isalnum() 8640 ) 8641 ann_header_desc[ann_header_info] = ann_header_match[i] 8642 if not ann_header_desc: 8643 raise ValueError("Invalid header description format") 8644 else: 8645 raise ValueError("Invalid header description format") 8646 8647 # Create variant id 8648 variant_id_column = self.get_variant_id_column() 8649 added_columns += [variant_id_column] 8650 8651 # Create dataframe 8652 dataframe_snpeff_hgvs = self.get_query_to_df( 8653 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8654 ) 8655 8656 # Create main NOMEN column 8657 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8658 speff_ann_infos 8659 ].apply( 8660 lambda x: extract_snpeff_hgvs( 8661 str(x), header=list(ann_header_desc.values()) 8662 ) 8663 ) 8664 8665 # Add snpeff_hgvs to header 8666 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8667 snpeff_hgvs, 8668 ".", 8669 "String", 8670 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8671 "howard calculation", 8672 "0", 8673 self.code_type_map.get("String"), 8674 ) 8675 8676 # Update 8677 sql_update = f""" 8678 UPDATE variants 8679 SET "INFO" = 8680 concat( 8681 CASE 8682 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8683 THEN '' 8684 ELSE concat("INFO", ';') 8685 END, 8686 CASE 8687 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8688 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8689 THEN concat( 8690 '{snpeff_hgvs}=', 8691 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8692 ) 8693 ELSE '' 8694 END 8695 ) 8696 FROM dataframe_snpeff_hgvs 8697 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8698 8699 """ 8700 self.conn.execute(sql_update) 8701 8702 # Delete dataframe 8703 del dataframe_snpeff_hgvs 8704 gc.collect() 8705 8706 else: 8707 8708 log.warning( 8709 "No snpEff 
annotation. Please Anotate with snpEff before use this calculation option" 8710 ) 8711 8712 # Remove added columns 8713 for added_column in added_columns: 8714 self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters
- snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` function represents the field in the VCF file that contains SnpEff annotations, from which the HGVS nomenclatures are extracted; defaults to "ANN".
8716 def calculation_snpeff_ann_explode( 8717 self, 8718 uniquify: bool = True, 8719 output_format: str = "fields", 8720 output_prefix: str = "snpeff_", 8721 snpeff_field: str = "ANN", 8722 ) -> None: 8723 """ 8724 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8725 exploding the HGVS field and updating variant information accordingly. 8726 8727 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8728 boolean flag that determines whether the output should be uniquified or not. When set to `True`, 8729 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8730 defaults to True 8731 :type uniquify: bool (optional) 8732 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8733 function specifies the format in which the output annotations will be generated. It has a 8734 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8735 format, defaults to fields 8736 :type output_format: str (optional) 8737 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8738 method is used to specify the prefix that will be added to the output annotations generated 8739 during the calculation process. This prefix helps to differentiate the newly added annotations 8740 from existing ones in the output data. By default, the, defaults to ANN_ 8741 :type output_prefix: str (optional) 8742 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8743 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8744 field will be processed to explode the HGVS annotations and update the variant information 8745 accordingly, defaults to ANN 8746 :type snpeff_field: str (optional) 8747 """ 8748 8749 # SnpEff annotation field 8750 snpeff_hgvs = "snpeff_ann_explode" 8751 8752 # Snpeff hgvs tags 8753 vcf_infos_tags = { 8754 snpeff_hgvs: "Explode snpEff annotations", 8755 } 8756 8757 # Prefix 8758 prefix = self.get_explode_infos_prefix() 8759 if prefix: 8760 prefix = "INFO/" 8761 8762 # snpEff fields 8763 speff_ann_infos = prefix + snpeff_field 8764 speff_hgvs_infos = prefix + snpeff_hgvs 8765 8766 # Variants table 8767 table_variants = self.get_table_variants() 8768 8769 # Header 8770 vcf_reader = self.get_header() 8771 8772 # Add columns 8773 added_columns = [] 8774 8775 # Explode HGVS field in column 8776 added_columns += self.explode_infos(fields=[snpeff_field]) 8777 log.debug(f"snpeff_field={snpeff_field}") 8778 log.debug(f"added_columns={added_columns}") 8779 8780 if snpeff_field in vcf_reader.infos: 8781 8782 # Extract ANN header 8783 ann_description = vcf_reader.infos[snpeff_field].desc 8784 pattern = r"'(.+?)'" 8785 match = re.search(pattern, ann_description) 8786 if match: 8787 ann_header_match = match.group(1).split(" | ") 8788 ann_header = [] 8789 ann_header_desc = {} 8790 for i in range(len(ann_header_match)): 8791 ann_header_info = "".join( 8792 char for char in ann_header_match[i] if char.isalnum() 8793 ) 8794 ann_header.append(ann_header_info) 8795 ann_header_desc[ann_header_info] = ann_header_match[i] 8796 if not ann_header_desc: 8797 raise ValueError("Invalid header description format") 8798 else: 8799 raise ValueError("Invalid header description format") 8800 8801 # Create variant id 8802 variant_id_column = self.get_variant_id_column() 8803 added_columns += [variant_id_column] 8804 8805 # Create dataframe 8806 dataframe_snpeff_hgvs = self.get_query_to_df( 8807 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8808 ) 8809 
8810 # Create snpEff columns 8811 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8812 speff_ann_infos 8813 ].apply( 8814 lambda x: explode_snpeff_ann( 8815 str(x), 8816 uniquify=uniquify, 8817 output_format=output_format, 8818 prefix=output_prefix, 8819 header=list(ann_header_desc.values()), 8820 ) 8821 ) 8822 8823 # Header 8824 ann_annotations_prefix = "" 8825 if output_format.upper() in ["JSON"]: 8826 ann_annotations_prefix = f"{output_prefix}=" 8827 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8828 output_prefix, 8829 ".", 8830 "String", 8831 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8832 + " - JSON format", 8833 "howard calculation", 8834 "0", 8835 self.code_type_map.get("String"), 8836 ) 8837 else: 8838 for ann_annotation in ann_header: 8839 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8840 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8841 ann_annotation_id, 8842 ".", 8843 "String", 8844 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8845 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8846 "howard calculation", 8847 "0", 8848 self.code_type_map.get("String"), 8849 ) 8850 8851 # Update 8852 sql_update = f""" 8853 UPDATE variants 8854 SET "INFO" = 8855 concat( 8856 CASE 8857 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8858 THEN '' 8859 ELSE concat("INFO", ';') 8860 END, 8861 CASE 8862 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8863 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8864 THEN concat( 8865 '{ann_annotations_prefix}', 8866 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8867 ) 8868 ELSE '' 8869 END 8870 ) 8871 FROM dataframe_snpeff_hgvs 8872 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8873 8874 """ 8875 self.conn.execute(sql_update) 8876 8877 # Delete dataframe 8878 del dataframe_snpeff_hgvs 8879 gc.collect() 8880 8881 else: 8882 8883 log.warning( 8884 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8885 ) 8886 8887 # Remove added columns 8888 for added_column in added_columns: 8889 self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a boolean flag that determines whether the output should be uniquified, meaning that duplicate entries are removed; defaults to True.
- output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format; defaults to "fields".
- output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data; defaults to "snpeff_".
- snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the annotations and update the variant information accordingly; defaults to "ANN".
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Options are read from param under
        "calculation" > "calculations" > "NOMEN" > "options":
        "hgvs_field", "pattern", "transcripts" (file of preferred transcripts),
        "transcripts_table", "transcripts_column", "transcripts_order".

        For each variant, `find_nomen` computes a dict of NOMEN sub-fields
        (NOMEN, CNOMEN, PNOMEN, ...) which are appended to the INFO column.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # NOMEN field: name of the per-variant struct column built by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO sub-field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads (fetched but not used in this method)
        threads = self.get_threads()

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Added columns (helper columns dropped at the end)
        added_columns = []

        # Get HGVS field (name of the INFO field holding HGVS annotations)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern (optional pattern passed through to find_nomen)
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts (optional file listing preferred transcripts,
        # first column of the file)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (table holding the preferred-transcript column)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column (per-variant preferred transcript, optional)
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # No per-variant transcript available: SELECT NULL AS transcript
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: only proceed if the exploded HGVS column exists
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe with variant keys, HGVS value and (optional)
            # per-variant preferred transcript
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcripts rank: preference rank (1-based) per transcript
            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
            transcripts_len = len(transcripts_rank)

            # Create main NOMEN column (dict of NOMEN sub-fields per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update: emits ';<FIELD>=<value>' for
                # non-empty values of the struct sub-field
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            # NOTE(review): each fragment above prepends ';', so when INFO is
            # the empty string '' (not NULL) the result starts with a leading
            # ';' — confirm this is handled downstream
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
9095 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9096 """ 9097 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9098 pipeline/sample for a variant and updates the variant information in a VCF file. 9099 9100 :param tag: The `tag` parameter is a string that represents the annotation field for the 9101 "findbypipeline" information in the VCF file. It is used to create the annotation field in the 9102 VCF header and to update the corresponding field in the variants table, defaults to 9103 findbypipeline 9104 :type tag: str (optional) 9105 """ 9106 9107 # if FORMAT and samples 9108 if ( 9109 "FORMAT" in self.get_header_columns_as_list() 9110 and self.get_header_sample_list() 9111 ): 9112 9113 # findbypipeline annotation field 9114 findbypipeline_tag = tag 9115 9116 # VCF infos tags 9117 vcf_infos_tags = { 9118 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9119 } 9120 9121 # Prefix 9122 prefix = self.get_explode_infos_prefix() 9123 9124 # Field 9125 findbypipeline_infos = prefix + findbypipeline_tag 9126 9127 # Variants table 9128 table_variants = self.get_table_variants() 9129 9130 # Header 9131 vcf_reader = self.get_header() 9132 9133 # Create variant id 9134 variant_id_column = self.get_variant_id_column() 9135 added_columns = [variant_id_column] 9136 9137 # variant_id, FORMAT and samples 9138 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9139 self.get_header_sample_list() 9140 ) 9141 9142 # Create dataframe 9143 dataframe_findbypipeline = self.get_query_to_df( 9144 f""" SELECT {samples_fields} FROM {table_variants} """ 9145 ) 9146 9147 # Create findbypipeline column 9148 dataframe_findbypipeline[findbypipeline_infos] = ( 9149 dataframe_findbypipeline.apply( 9150 lambda row: findbypipeline( 9151 row, samples=self.get_header_sample_list() 9152 ), 9153 axis=1, 9154 ) 9155 ) 9156 9157 # Add snpeff_hgvs to header 9158 
vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9159 findbypipeline_tag, 9160 ".", 9161 "String", 9162 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9163 "howard calculation", 9164 "0", 9165 self.code_type_map.get("String"), 9166 ) 9167 9168 # Update 9169 sql_update = f""" 9170 UPDATE variants 9171 SET "INFO" = 9172 concat( 9173 CASE 9174 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9175 THEN '' 9176 ELSE concat("INFO", ';') 9177 END, 9178 CASE 9179 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9180 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9181 THEN concat( 9182 '{findbypipeline_tag}=', 9183 dataframe_findbypipeline."{findbypipeline_infos}" 9184 ) 9185 ELSE '' 9186 END 9187 ) 9188 FROM dataframe_findbypipeline 9189 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9190 """ 9191 self.conn.execute(sql_update) 9192 9193 # Remove added columns 9194 for added_column in added_columns: 9195 self.drop_column(column=added_column) 9196 9197 # Delete dataframe 9198 del dataframe_findbypipeline 9199 gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- tag: The `tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table; defaults to "findbypipeline".
9201 def calculation_genotype_concordance(self) -> None: 9202 """ 9203 The function `calculation_genotype_concordance` calculates the genotype concordance for 9204 multi-caller VCF files and updates the variant information in the database. 9205 """ 9206 9207 # if FORMAT and samples 9208 if ( 9209 "FORMAT" in self.get_header_columns_as_list() 9210 and self.get_header_sample_list() 9211 ): 9212 9213 # genotypeconcordance annotation field 9214 genotypeconcordance_tag = "genotypeconcordance" 9215 9216 # VCF infos tags 9217 vcf_infos_tags = { 9218 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9219 } 9220 9221 # Prefix 9222 prefix = self.get_explode_infos_prefix() 9223 9224 # Field 9225 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9226 9227 # Variants table 9228 table_variants = self.get_table_variants() 9229 9230 # Header 9231 vcf_reader = self.get_header() 9232 9233 # Create variant id 9234 variant_id_column = self.get_variant_id_column() 9235 added_columns = [variant_id_column] 9236 9237 # variant_id, FORMAT and samples 9238 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9239 self.get_header_sample_list() 9240 ) 9241 9242 # Create dataframe 9243 dataframe_genotypeconcordance = self.get_query_to_df( 9244 f""" SELECT {samples_fields} FROM {table_variants} """ 9245 ) 9246 9247 # Create genotypeconcordance column 9248 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9249 dataframe_genotypeconcordance.apply( 9250 lambda row: genotypeconcordance( 9251 row, samples=self.get_header_sample_list() 9252 ), 9253 axis=1, 9254 ) 9255 ) 9256 9257 # Add genotypeconcordance to header 9258 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9259 genotypeconcordance_tag, 9260 ".", 9261 "String", 9262 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9263 "howard calculation", 9264 "0", 9265 self.code_type_map.get("String"), 9266 ) 9267 9268 # Update 9269 sql_update = f""" 9270 
UPDATE variants 9271 SET "INFO" = 9272 concat( 9273 CASE 9274 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9275 THEN '' 9276 ELSE concat("INFO", ';') 9277 END, 9278 CASE 9279 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9280 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9281 THEN concat( 9282 '{genotypeconcordance_tag}=', 9283 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9284 ) 9285 ELSE '' 9286 END 9287 ) 9288 FROM dataframe_genotypeconcordance 9289 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9290 """ 9291 self.conn.execute(sql_update) 9292 9293 # Remove added columns 9294 for added_column in added_columns: 9295 self.drop_column(column=added_column) 9296 9297 # Delete dataframe 9298 del dataframe_genotypeconcordance 9299 gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
9301 def calculation_barcode(self, tag: str = "barcode") -> None: 9302 """ 9303 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9304 updates the INFO field in the file with the calculated barcode values. 9305 9306 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9307 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 9308 the default tag name is set to "barcode", defaults to barcode 9309 :type tag: str (optional) 9310 """ 9311 9312 # if FORMAT and samples 9313 if ( 9314 "FORMAT" in self.get_header_columns_as_list() 9315 and self.get_header_sample_list() 9316 ): 9317 9318 # barcode annotation field 9319 if not tag: 9320 tag = "barcode" 9321 9322 # VCF infos tags 9323 vcf_infos_tags = { 9324 tag: "barcode calculation (VaRank)", 9325 } 9326 9327 # Prefix 9328 prefix = self.get_explode_infos_prefix() 9329 9330 # Field 9331 barcode_infos = prefix + tag 9332 9333 # Variants table 9334 table_variants = self.get_table_variants() 9335 9336 # Header 9337 vcf_reader = self.get_header() 9338 9339 # Create variant id 9340 variant_id_column = self.get_variant_id_column() 9341 added_columns = [variant_id_column] 9342 9343 # variant_id, FORMAT and samples 9344 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9345 self.get_header_sample_list() 9346 ) 9347 9348 # Create dataframe 9349 dataframe_barcode = self.get_query_to_df( 9350 f""" SELECT {samples_fields} FROM {table_variants} """ 9351 ) 9352 9353 # Create barcode column 9354 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9355 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9356 ) 9357 9358 # Add barcode to header 9359 vcf_reader.infos[tag] = vcf.parser._Info( 9360 tag, 9361 ".", 9362 "String", 9363 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9364 "howard calculation", 9365 "0", 9366 self.code_type_map.get("String"), 9367 ) 9368 9369 # 
Update 9370 sql_update = f""" 9371 UPDATE {table_variants} 9372 SET "INFO" = 9373 concat( 9374 CASE 9375 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9376 THEN '' 9377 ELSE concat("INFO", ';') 9378 END, 9379 CASE 9380 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9381 AND dataframe_barcode."{barcode_infos}" NOT NULL 9382 THEN concat( 9383 '{tag}=', 9384 dataframe_barcode."{barcode_infos}" 9385 ) 9386 ELSE '' 9387 END 9388 ) 9389 FROM dataframe_barcode 9390 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9391 """ 9392 self.conn.execute(sql_update) 9393 9394 # Remove added columns 9395 for added_column in added_columns: 9396 self.drop_column(column=added_column) 9397 9398 # Delete dataframe 9399 del dataframe_barcode 9400 gc.collect()
The `calculation_barcode` function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.

Parameters
- tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
  name that will be used for the barcode calculation in the VCF file. If no tag name is
  provided, the default tag name "barcode" is used; defaults to "barcode".
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode from the genotypes of the pedigree samples
        and write it into each sample's genotype column as two extra FORMAT
        fields: '<tag>' (the barcode) and '<tag>S' (the list of family samples).

        The pedigree comes from param
        `calculation.calculations.BARCODEFAMILY.family_pedigree` and may be a
        YAML/JSON file path, a JSON string, a comma-separated sample list, or a
        dict; when absent, all samples of the VCF are used.

        Runs only when the VCF has a FORMAT column and at least one sample.

        :param tag: FORMAT tag used for the barcode family annotation; an empty
            value falls back to "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is neither str nor dict, or resolves
            to an empty sample set
        """

        # Family barcode requires genotypes: FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back if an empty tag was passed)
            if not tag:
                tag = "BCF"

            # Header descriptions for the two FORMAT tags added below
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file (YAML — also parses JSON, a YAML subset)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, then fall back to a
                # comma-separated sample list mapped onto itself
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples involved in the family barcode
                ped_samples = list(ped.values())

            else:
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column holding the computed family barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (tracked so it can be dropped afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode per variant (project helper `barcode`)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare both FORMAT tags ('<tag>' and '<tag>S') in the VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per genotype column (samples + FORMAT):
            # - family samples get the barcode value and the sample list
            # - FORMAT gets the two tag names appended
            # - other samples get '.' placeholders to keep FORMAT aligned
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Used to expand a missing genotype './.' into one '.' per
                # FORMAT field before appending the new values
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                    concat(
                        CASE
                            WHEN {table_variants}."{sample}" = './.'
                            THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                            ELSE {table_variants}."{sample}"
                        END,
                        ':',
                        {value},
                        ':',
                        {value_samples}
                    )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe eagerly (can be large)
            del dataframe_barcode
            gc.collect()
The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.

Parameters
- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the
  barcode tag that will be added to the VCF file during the calculation process. If no value is
  provided for the `tag` parameter, the default value "BCF" is used; defaults to "BCF".
9592 def calculation_trio(self) -> None: 9593 """ 9594 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9595 information to the INFO field of each variant. 9596 """ 9597 9598 # if FORMAT and samples 9599 if ( 9600 "FORMAT" in self.get_header_columns_as_list() 9601 and self.get_header_sample_list() 9602 ): 9603 9604 # trio annotation field 9605 trio_tag = "trio" 9606 9607 # VCF infos tags 9608 vcf_infos_tags = { 9609 "trio": "trio calculation", 9610 } 9611 9612 # Param 9613 param = self.get_param() 9614 9615 # Prefix 9616 prefix = self.get_explode_infos_prefix() 9617 9618 # Trio param 9619 trio_ped = ( 9620 param.get("calculation", {}) 9621 .get("calculations", {}) 9622 .get("TRIO", {}) 9623 .get("trio_pedigree", None) 9624 ) 9625 9626 # Load trio 9627 if trio_ped: 9628 9629 # Trio pedigree is a file 9630 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9631 log.debug("TRIO pedigree is file") 9632 with open(full_path(trio_ped)) as trio_ped: 9633 trio_ped = yaml.safe_load(trio_ped) 9634 9635 # Trio pedigree is a string 9636 elif isinstance(trio_ped, str): 9637 log.debug("TRIO pedigree is str") 9638 try: 9639 trio_ped = json.loads(trio_ped) 9640 log.debug("TRIO pedigree is json str") 9641 except ValueError as e: 9642 trio_samples = trio_ped.split(",") 9643 if len(trio_samples) == 3: 9644 trio_ped = { 9645 "father": trio_samples[0], 9646 "mother": trio_samples[1], 9647 "child": trio_samples[2], 9648 } 9649 log.debug("TRIO pedigree is list str") 9650 else: 9651 msg_error = "TRIO pedigree not well formatted" 9652 log.error(msg_error) 9653 raise ValueError(msg_error) 9654 9655 # Trio pedigree is a dict 9656 elif isinstance(trio_ped, dict): 9657 log.debug("TRIO pedigree is dict") 9658 9659 # Trio pedigree is not well formatted 9660 else: 9661 msg_error = "TRIO pedigree not well formatted" 9662 log.error(msg_error) 9663 raise ValueError(msg_error) 9664 9665 # Construct trio list 9666 trio_samples = [ 9667 
trio_ped.get("father", ""), 9668 trio_ped.get("mother", ""), 9669 trio_ped.get("child", ""), 9670 ] 9671 9672 else: 9673 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9674 samples_list = self.get_header_sample_list() 9675 if len(samples_list) >= 3: 9676 trio_samples = self.get_header_sample_list()[0:3] 9677 trio_ped = { 9678 "father": trio_samples[0], 9679 "mother": trio_samples[1], 9680 "child": trio_samples[2], 9681 } 9682 else: 9683 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9684 log.error(msg_error) 9685 raise ValueError(msg_error) 9686 9687 # Check trio pedigree 9688 if not trio_ped or len(trio_ped) != 3: 9689 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9690 log.error(msg_error) 9691 raise ValueError(msg_error) 9692 9693 # Log 9694 log.info( 9695 f"Calculation 'TRIO' - Samples: " 9696 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9697 ) 9698 9699 # Field 9700 trio_infos = prefix + trio_tag 9701 9702 # Variants table 9703 table_variants = self.get_table_variants() 9704 9705 # Header 9706 vcf_reader = self.get_header() 9707 9708 # Create variant id 9709 variant_id_column = self.get_variant_id_column() 9710 added_columns = [variant_id_column] 9711 9712 # variant_id, FORMAT and samples 9713 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9714 self.get_header_sample_list() 9715 ) 9716 9717 # Create dataframe 9718 dataframe_trio = self.get_query_to_df( 9719 f""" SELECT {samples_fields} FROM {table_variants} """ 9720 ) 9721 9722 # Create trio column 9723 dataframe_trio[trio_infos] = dataframe_trio.apply( 9724 lambda row: trio(row, samples=trio_samples), axis=1 9725 ) 9726 9727 # Add trio to header 9728 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9729 trio_tag, 9730 ".", 9731 "String", 9732 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9733 "howard calculation", 9734 "0", 9735 self.code_type_map.get("String"), 9736 ) 9737 9738 # Update 9739 
sql_update = f""" 9740 UPDATE {table_variants} 9741 SET "INFO" = 9742 concat( 9743 CASE 9744 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9745 THEN '' 9746 ELSE concat("INFO", ';') 9747 END, 9748 CASE 9749 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9750 AND dataframe_trio."{trio_infos}" NOT NULL 9751 THEN concat( 9752 '{trio_tag}=', 9753 dataframe_trio."{trio_infos}" 9754 ) 9755 ELSE '' 9756 END 9757 ) 9758 FROM dataframe_trio 9759 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9760 """ 9761 self.conn.execute(sql_update) 9762 9763 # Remove added columns 9764 for added_column in added_columns: 9765 self.drop_column(column=added_column) 9766 9767 # Delete dataframe 9768 del dataframe_trio 9769 gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9771 def calculation_vaf_normalization(self) -> None: 9772 """ 9773 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9774 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9775 :return: The function does not return anything. 9776 """ 9777 9778 # if FORMAT and samples 9779 if ( 9780 "FORMAT" in self.get_header_columns_as_list() 9781 and self.get_header_sample_list() 9782 ): 9783 9784 # vaf_normalization annotation field 9785 vaf_normalization_tag = "VAF" 9786 9787 # VCF infos tags 9788 vcf_infos_tags = { 9789 "VAF": "VAF Variant Frequency", 9790 } 9791 9792 # Prefix 9793 prefix = self.get_explode_infos_prefix() 9794 9795 # Variants table 9796 table_variants = self.get_table_variants() 9797 9798 # Header 9799 vcf_reader = self.get_header() 9800 9801 # Do not calculate if VAF already exists 9802 if "VAF" in vcf_reader.formats: 9803 log.debug("VAF already on genotypes") 9804 return 9805 9806 # Create variant id 9807 variant_id_column = self.get_variant_id_column() 9808 added_columns = [variant_id_column] 9809 9810 # variant_id, FORMAT and samples 9811 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9812 f""" "{sample}" """ for sample in self.get_header_sample_list() 9813 ) 9814 9815 # Create dataframe 9816 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9817 log.debug(f"query={query}") 9818 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9819 9820 vaf_normalization_set = [] 9821 9822 # for each sample vaf_normalization 9823 for sample in self.get_header_sample_list(): 9824 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9825 lambda row: vaf_normalization(row, sample=sample), axis=1 9826 ) 9827 vaf_normalization_set.append( 9828 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9829 ) 9830 9831 # Add VAF to FORMAT 9832 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9833 "FORMAT" 9834 ].apply(lambda x: str(x) + ":VAF") 9835 vaf_normalization_set.append( 9836 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9837 ) 9838 9839 # Add vaf_normalization to header 9840 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9841 id=vaf_normalization_tag, 9842 num="1", 9843 type="Float", 9844 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9845 type_code=self.code_type_map.get("Float"), 9846 ) 9847 9848 # Create fields to add in INFO 9849 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9850 9851 # Update 9852 sql_update = f""" 9853 UPDATE {table_variants} 9854 SET {sql_vaf_normalization_set} 9855 FROM dataframe_vaf_normalization 9856 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9857 9858 """ 9859 self.conn.execute(sql_update) 9860 9861 # Remove added columns 9862 for added_column in added_columns: 9863 self.drop_column(column=added_column) 9864 9865 # Delete dataframe 9866 del dataframe_vaf_normalization 9867 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant statistics (count, list, min, max, mean, median,
        standard deviation) of a genotype field across all samples, and append
        them to the INFO column as '<info>_stats_*' tags.

        Runs only when the VCF has a FORMAT column and at least one sample.

        :param info: name of the genotype field to aggregate (e.g. "VAF", "DP");
            drives the generated tag names '<info>_stats_nb', '<info>_stats_list',
            '<info>_stats_min', '<info>_stats_max', '<info>_stats_mean',
            '<info>_stats_mediane', '<info>_stats_stdev', defaults to VAF
        :type info: str (optional)
        """

        # Genotype statistics require genotypes: FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base name for all generated stats tags
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one per generated statistic)
            # NOTE: "mediane" keeps the existing (misspelled) public tag name —
            # renaming it would break consumers of already-annotated files
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (tracked so it can be dropped afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute per-variant stats (project helper `genotype_stats`,
            # which returns a dict keyed by the tag names above)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL CASE fragments, one per statistic
            sql_vaf_stats_fields = []

            # For each statistic: extract its value into its own column,
            # declare the tag in the header, and build its INFO fragment
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stats tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Only the first fragment omits the leading ';' separator
                # NOTE(review): if the first stat is NULL for a variant, the
                # second fragment still carries ';', which could yield a double
                # separator after INFO — confirm whether stats can be NULL here
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # Comma-joined fragments become arguments of the concat() below
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append all stats tags to INFO, joined on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe eagerly (can be large)
            del dataframe_vaf_stats
            gc.collect()
The `calculation_genotype_stats` function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.

Parameters
- info: The `info` parameter is a string that represents the type of information for which
  genotype statistics are calculated. It is used to generate various VCF info tags for the
  statistics, such as the number of occurrences, the list of values, the minimum value, the
  maximum value, the mean, the median and the standard deviation; defaults to "VAF".
10007 def calculation_transcripts_annotation( 10008 self, info_json: str = None, info_format: str = None 10009 ) -> None: 10010 """ 10011 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 10012 field to it if transcripts are available. 10013 10014 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10015 is a string parameter that represents the information field to be used in the transcripts JSON. 10016 It is used to specify the JSON format for the transcripts information. If no value is provided 10017 when calling the method, it defaults to " 10018 :type info_json: str 10019 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10020 method is a string parameter that specifies the format of the information field to be used in 10021 the transcripts JSON. It is used to define the format of the information field 10022 :type info_format: str 10023 """ 10024 10025 # Create transcripts table 10026 transcripts_table = self.create_transcript_view() 10027 10028 # Add info field 10029 if transcripts_table: 10030 self.transcript_view_to_variants( 10031 transcripts_table=transcripts_table, 10032 transcripts_info_field_json=info_json, 10033 transcripts_info_field_format=info_format, 10034 ) 10035 else: 10036 log.info("No Transcripts to process. Check param.json file configuration")
The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
field to it if transcripts are available.

Parameters
- info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method is a
  string parameter that represents the information field to be used in the transcripts JSON. It
  is used to specify the JSON format for the transcripts information; defaults to None.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method
  is a string parameter that specifies the format of the information field to be used in the
  transcripts JSON. It is used to define the format of the information field.
10038 def calculation_transcripts_prioritization(self) -> None: 10039 """ 10040 The function `calculation_transcripts_prioritization` creates a transcripts table and 10041 prioritizes transcripts based on certain criteria. 10042 """ 10043 10044 # Create transcripts table 10045 transcripts_table = self.create_transcript_view() 10046 10047 # Add info field 10048 if transcripts_table: 10049 self.transcripts_prioritization(transcripts_table=transcripts_table) 10050 else: 10051 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
10053 def calculation_transcripts_export(self) -> None: 10054 """ """ 10055 10056 # Create transcripts table 10057 transcripts_table = self.create_transcript_view() 10058 10059 # Add info field 10060 if transcripts_table: 10061 self.transcripts_export(transcripts_table=transcripts_table) 10062 else: 10063 log.info("No Transcripts to process. Check param.json file configuration")
10069 def transcripts_export( 10070 self, transcripts_table: str = None, param: dict = {} 10071 ) -> bool: 10072 """ """ 10073 10074 log.debug("Start transcripts export...") 10075 10076 # Param 10077 if not param: 10078 param = self.get_param() 10079 10080 # Param export 10081 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10082 10083 # Output file 10084 transcripts_export_output = param_transcript_export.get("output", None) 10085 10086 if not param_transcript_export or not transcripts_export_output: 10087 log.warning(f"No transcriipts export parameters defined!") 10088 return False 10089 10090 # List of transcripts annotations 10091 query_describe = f""" 10092 SELECT column_name 10093 FROM ( 10094 DESCRIBE SELECT * FROM {transcripts_table} 10095 ) 10096 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10097 """ 10098 transcripts_annotations_list = list( 10099 self.get_query_to_df(query=query_describe)["column_name"] 10100 ) 10101 10102 # Create transcripts table for export 10103 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10104 random.choices(string.ascii_uppercase + string.digits, k=10) 10105 ) 10106 query_create_transcripts_table_export = f""" 10107 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10108 """ 10109 self.execute_query(query=query_create_transcripts_table_export) 10110 10111 # Output file format 10112 transcripts_export_output_format = get_file_format( 10113 filename=transcripts_export_output 10114 ) 10115 10116 # Format VCF - construct INFO 10117 if transcripts_export_output_format in ["vcf"]: 10118 10119 # Construct query update INFO and header 10120 query_update_info = [] 10121 for field in transcripts_annotations_list: 10122 10123 # If field not in header 10124 if field not in self.get_header_infos_list(): 10125 10126 # Add PZ Transcript in header 10127 
self.get_header().infos[field] = vcf.parser._Info( 10128 field, 10129 ".", 10130 "String", 10131 f"Annotation '{field}' from transcript view", 10132 "unknown", 10133 "unknown", 10134 0, 10135 ) 10136 10137 # Add field as INFO/tag 10138 query_update_info.append( 10139 f""" 10140 CASE 10141 WHEN "{field}" IS NOT NULL 10142 THEN concat('{field}=', "{field}", ';') 10143 ELSE '' 10144 END 10145 """ 10146 ) 10147 10148 # Query param 10149 query_update_info_value = ( 10150 f""" concat('', {", ".join(query_update_info)}) """ 10151 ) 10152 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """ 10153 10154 else: 10155 10156 # Query param 10157 query_update_info_value = f""" NULL """ 10158 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10159 10160 # Update query INFO column 10161 query_update = f""" 10162 UPDATE {transcripts_table_export} 10163 SET INFO = {query_update_info_value} 10164 10165 """ 10166 self.execute_query(query=query_update) 10167 10168 # Export 10169 self.export_output( 10170 output_file=transcripts_export_output, 10171 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10172 ) 10173 10174 # Drop transcripts export table 10175 query_drop_transcripts_table_export = f""" 10176 DROP TABLE {transcripts_table_export} 10177 """ 10178 self.execute_query(query=query_drop_transcripts_table_export)
10180 def transcripts_prioritization( 10181 self, transcripts_table: str = None, param: dict = {} 10182 ) -> bool: 10183 """ 10184 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 10185 and updates the variants table with the prioritized information. 10186 10187 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10188 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 10189 This parameter is used to identify the table where the transcripts data is stored for the 10190 prioritization process 10191 :type transcripts_table: str 10192 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 10193 that contains various configuration settings for the prioritization process of transcripts. It 10194 is used to customize the behavior of the prioritization algorithm and includes settings such as 10195 the prefix for prioritization fields, default profiles, and other 10196 :type param: dict 10197 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 10198 transcripts prioritization process is successfully completed, and `False` if there are any 10199 issues or if no profile is defined for transcripts prioritization. 
10200 """ 10201 10202 log.debug("Start transcripts prioritization...") 10203 10204 # Param 10205 if not param: 10206 param = self.get_param() 10207 10208 # Variants table 10209 table_variants = self.get_table_variants() 10210 10211 # Transcripts table 10212 if transcripts_table is None: 10213 transcripts_table = self.create_transcript_view( 10214 transcripts_table="transcripts", param=param 10215 ) 10216 if transcripts_table is None: 10217 msg_err = "No Transcripts table availalble" 10218 log.error(msg_err) 10219 raise ValueError(msg_err) 10220 log.debug(f"transcripts_table={transcripts_table}") 10221 10222 # Get transcripts columns 10223 columns_as_list_query = f""" 10224 DESCRIBE {transcripts_table} 10225 """ 10226 columns_as_list = list( 10227 self.get_query_to_df(columns_as_list_query)["column_name"] 10228 ) 10229 10230 # Create INFO if not exists 10231 if "INFO" not in columns_as_list: 10232 query_add_info = f""" 10233 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 10234 """ 10235 self.execute_query(query_add_info) 10236 10237 # Prioritization param and Force only PZ Score and Flag 10238 pz_param = param.get("transcripts", {}).get("prioritization", {}) 10239 10240 # PZ profile by default 10241 pz_profile_default = ( 10242 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 10243 ) 10244 10245 # Exit if no profile 10246 if pz_profile_default is None: 10247 log.warning("No profile defined for transcripts prioritization") 10248 return False 10249 10250 # PZ fields 10251 pz_param_pzfields = {} 10252 10253 # PZ field transcripts 10254 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 10255 10256 # Add PZ Transcript in header 10257 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 10258 pz_fields_transcripts, 10259 ".", 10260 "String", 10261 f"Transcript selected from prioritization process, profile {pz_profile_default}", 10262 "unknown", 10263 "unknown", 10264 
code_type_map["String"], 10265 ) 10266 10267 # Mandatory fields 10268 pz_mandatory_fields_list = [ 10269 "Score", 10270 "Flag", 10271 "Tags", 10272 "Comment", 10273 "Infos", 10274 "Class", 10275 ] 10276 pz_mandatory_fields = [] 10277 for pz_mandatory_field in pz_mandatory_fields_list: 10278 pz_mandatory_fields.append( 10279 pz_param.get("pzprefix", "PTZ") + pz_mandatory_field 10280 ) 10281 10282 # PZ fields in param 10283 for pz_field in pz_param.get("pzfields", []): 10284 if pz_field in pz_mandatory_fields_list: 10285 pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = ( 10286 pz_param.get("pzprefix", "PTZ") + pz_field 10287 ) 10288 else: 10289 pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field 10290 pz_param_pzfields[pz_field] = pz_field_new 10291 10292 # Add PZ Transcript in header 10293 self.get_header().infos[pz_field_new] = vcf.parser._Info( 10294 pz_field_new, 10295 ".", 10296 "String", 10297 f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}", 10298 "unknown", 10299 "unknown", 10300 code_type_map["String"], 10301 ) 10302 10303 # PZ fields param 10304 pz_param["pzfields"] = pz_mandatory_fields 10305 10306 # Prioritization 10307 prioritization_result = self.prioritization( 10308 table=transcripts_table, 10309 pz_param=param.get("transcripts", {}).get("prioritization", {}), 10310 ) 10311 if not prioritization_result: 10312 log.warning("Transcripts prioritization not processed") 10313 return False 10314 10315 # PZ fields sql query 10316 query_update_select_list = [] 10317 query_update_concat_list = [] 10318 query_update_order_list = [] 10319 for pz_param_pzfield in set( 10320 list(pz_param_pzfields.keys()) + pz_mandatory_fields 10321 ): 10322 query_update_select_list.append(f" {pz_param_pzfield}, ") 10323 10324 for pz_param_pzfield in pz_param_pzfields: 10325 query_update_concat_list.append( 10326 f""" 10327 , CASE 10328 WHEN {pz_param_pzfield} IS NOT NULL 10329 THEN 
concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield}) 10330 ELSE '' 10331 END 10332 """ 10333 ) 10334 10335 # Order by 10336 pz_orders = ( 10337 param.get("transcripts", {}) 10338 .get("prioritization", {}) 10339 .get("prioritization_transcripts_order", {}) 10340 ) 10341 if not pz_orders: 10342 pz_orders = { 10343 pz_param.get("pzprefix", "PTZ") + "Flag": "DESC", 10344 pz_param.get("pzprefix", "PTZ") + "Score": "DESC", 10345 } 10346 for pz_order in pz_orders: 10347 query_update_order_list.append( 10348 f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """ 10349 ) 10350 10351 # Fields to explode 10352 fields_to_explode = ( 10353 list(pz_param_pzfields.keys()) 10354 + pz_mandatory_fields 10355 + list(pz_orders.keys()) 10356 ) 10357 # Remove transcript column as a specific transcript column 10358 if "transcript" in fields_to_explode: 10359 fields_to_explode.remove("transcript") 10360 10361 # Fields intranscripts table 10362 query_transcripts_table = f""" 10363 DESCRIBE SELECT * FROM {transcripts_table} 10364 """ 10365 query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) 10366 10367 # Check fields to explode 10368 for field_to_explode in fields_to_explode: 10369 if field_to_explode not in self.get_header_infos_list() + list( 10370 query_transcripts_table.column_name 10371 ): 10372 msg_err = f"INFO/{field_to_explode} NOT IN header" 10373 log.error(msg_err) 10374 raise ValueError(msg_err) 10375 10376 # Explode fields to explode 10377 self.explode_infos( 10378 table=transcripts_table, 10379 fields=fields_to_explode, 10380 ) 10381 10382 # Transcript preference file 10383 transcripts_preference_file = ( 10384 param.get("transcripts", {}) 10385 .get("prioritization", {}) 10386 .get("prioritization_transcripts", {}) 10387 ) 10388 transcripts_preference_file = full_path(transcripts_preference_file) 10389 10390 # Transcript preference forced 10391 transcript_preference_force = ( 10392 param.get("transcripts", {}) 10393 
.get("prioritization", {}) 10394 .get("prioritization_transcripts_force", False) 10395 ) 10396 # Transcript version forced 10397 transcript_version_force = ( 10398 param.get("transcripts", {}) 10399 .get("prioritization", {}) 10400 .get("prioritization_transcripts_version_force", False) 10401 ) 10402 10403 # Transcripts Ranking 10404 if transcripts_preference_file: 10405 10406 # Transcripts file to dataframe 10407 if os.path.exists(transcripts_preference_file): 10408 transcripts_preference_dataframe = transcripts_file_to_df( 10409 transcripts_preference_file 10410 ) 10411 else: 10412 log.error( 10413 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10414 ) 10415 raise ValueError( 10416 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10417 ) 10418 10419 # Order by depending to transcript preference forcing 10420 if transcript_preference_force: 10421 order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """ 10422 else: 10423 order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """ 10424 10425 # Transcript columns joined depend on version consideration 10426 if transcript_version_force: 10427 transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """ 10428 else: 10429 transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """ 10430 10431 # Query ranking for update 10432 query_update_ranking = f""" 10433 SELECT 10434 "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)} 10435 ROW_NUMBER() OVER ( 10436 PARTITION BY "#CHROM", POS, REF, ALT 10437 ORDER BY {order_by} 10438 ) AS rn 10439 FROM {transcripts_table} 10440 LEFT JOIN 10441 ( 10442 SELECT transcript AS 'transcripts_preference', row_number() OVER () AS 
transcripts_preference_order 10443 FROM transcripts_preference_dataframe 10444 ) AS transcripts_preference 10445 ON {transcripts_version_join} 10446 """ 10447 10448 else: 10449 10450 # Query ranking for update 10451 query_update_ranking = f""" 10452 SELECT 10453 "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)} 10454 ROW_NUMBER() OVER ( 10455 PARTITION BY "#CHROM", POS, REF, ALT 10456 ORDER BY {" , ".join(query_update_order_list)} 10457 ) AS rn 10458 FROM {transcripts_table} 10459 """ 10460 10461 # Export Transcripts prioritization infos to variants table 10462 query_update = f""" 10463 WITH RankedTranscripts AS ( 10464 {query_update_ranking} 10465 ) 10466 UPDATE {table_variants} 10467 SET 10468 INFO = CONCAT(CASE 10469 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10470 THEN '' 10471 ELSE concat("INFO", ';') 10472 END, 10473 concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)}) 10474 ) 10475 FROM 10476 RankedTranscripts 10477 WHERE 10478 rn = 1 10479 AND variants."#CHROM" = RankedTranscripts."#CHROM" 10480 AND variants."POS" = RankedTranscripts."POS" 10481 AND variants."REF" = RankedTranscripts."REF" 10482 AND variants."ALT" = RankedTranscripts."ALT" 10483 """ 10484 10485 # log.debug(f"query_update={query_update}") 10486 self.execute_query(query=query_update) 10487 10488 # Return 10489 return True
The transcripts_prioritization function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other options.
Returns
The `transcripts_prioritization` function returns a boolean value: `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
10491 def create_transcript_view_from_columns_map( 10492 self, 10493 transcripts_table: str = "transcripts", 10494 columns_maps: dict = {}, 10495 added_columns: list = [], 10496 temporary_tables: list = None, 10497 annotation_fields: list = None, 10498 column_rename: dict = {}, 10499 column_clean: bool = False, 10500 column_case: str = None, 10501 ) -> tuple[list, list, list]: 10502 """ 10503 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10504 specified columns mapping for transcripts data. 10505 10506 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10507 of the table where the transcripts data is stored or will be stored in the database. This table 10508 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10509 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10510 :type transcripts_table: str (optional) 10511 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10512 about how to map columns from a transcripts table to create a view. Each entry in the 10513 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10514 typically includes details such as the main transcript column and additional information columns 10515 :type columns_maps: dict 10516 :param added_columns: The `added_columns` parameter in the 10517 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10518 that will be added to the view being created based on the columns map provided. 
These columns 10519 are generated by exploding the transcript information columns along with the main transcript 10520 column 10521 :type added_columns: list 10522 :param temporary_tables: The `temporary_tables` parameter in the 10523 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10524 tables created during the process of creating a transcript view from a columns map. These 10525 temporary tables are used to store intermediate results or transformations before the final view 10526 is generated 10527 :type temporary_tables: list 10528 :param annotation_fields: The `annotation_fields` parameter in the 10529 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10530 used for annotation in the query view creation process. These fields are extracted from the 10531 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10532 :type annotation_fields: list 10533 :param column_rename: The `column_rename` parameter in the 10534 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10535 custom renaming for columns during the creation of the temporary table view. This parameter 10536 provides a mapping of original column names to the desired renamed column names. By using this 10537 parameter, 10538 :type column_rename: dict 10539 :param column_clean: The `column_clean` parameter in the 10540 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10541 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10542 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10543 False 10544 :type column_clean: bool (optional) 10545 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10546 function is used to specify the case transformation to be applied to the columns during the view 10547 creation process. It allows you to control whether the column values should be converted to 10548 lowercase, uppercase, or remain unchanged 10549 :type column_case: str 10550 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10551 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10552 """ 10553 10554 log.debug("Start transcrpts view creation from columns map...") 10555 10556 # "from_columns_map": [ 10557 # { 10558 # "transcripts_column": "Ensembl_transcriptid", 10559 # "transcripts_infos_columns": [ 10560 # "genename", 10561 # "Ensembl_geneid", 10562 # "LIST_S2_score", 10563 # "LIST_S2_pred", 10564 # ], 10565 # }, 10566 # { 10567 # "transcripts_column": "Ensembl_transcriptid", 10568 # "transcripts_infos_columns": [ 10569 # "genename", 10570 # "VARITY_R_score", 10571 # "Aloft_pred", 10572 # ], 10573 # }, 10574 # ], 10575 10576 # Init 10577 if temporary_tables is None: 10578 temporary_tables = [] 10579 if annotation_fields is None: 10580 annotation_fields = [] 10581 10582 # Variants table 10583 table_variants = self.get_table_variants() 10584 10585 for columns_map in columns_maps: 10586 10587 # Transcript column 10588 transcripts_column = columns_map.get("transcripts_column", None) 10589 10590 # Transcripts infos columns 10591 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10592 10593 # Transcripts infos columns rename 10594 column_rename = columns_map.get("column_rename", column_rename) 10595 10596 # Transcripts infos columns clean 10597 column_clean = columns_map.get("column_clean", column_clean) 10598 10599 # Transcripts infos columns case 10600 column_case = 
columns_map.get("column_case", column_case) 10601 10602 if transcripts_column is not None: 10603 10604 # Explode 10605 added_columns += self.explode_infos( 10606 fields=[transcripts_column] + transcripts_infos_columns 10607 ) 10608 10609 # View clauses 10610 clause_select_variants = [] 10611 clause_select_tanscripts = [] 10612 for field in [transcripts_column] + transcripts_infos_columns: 10613 10614 # AS field 10615 as_field = field 10616 10617 # Rename 10618 if column_rename: 10619 as_field = column_rename.get(as_field, as_field) 10620 10621 # Clean 10622 if column_clean: 10623 as_field = clean_annotation_field(as_field) 10624 10625 # Case 10626 if column_case: 10627 if column_case.lower() in ["lower"]: 10628 as_field = as_field.lower() 10629 elif column_case.lower() in ["upper"]: 10630 as_field = as_field.upper() 10631 10632 # Clause select Variants 10633 clause_select_variants.append( 10634 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10635 ) 10636 10637 if field in [transcripts_column]: 10638 clause_select_tanscripts.append( 10639 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10640 ) 10641 else: 10642 clause_select_tanscripts.append( 10643 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10644 ) 10645 annotation_fields.append(as_field) 10646 10647 # Querey View 10648 query = f""" 10649 SELECT 10650 "#CHROM", POS, REF, ALT, INFO, 10651 "{transcripts_column}" AS 'transcript', 10652 {", ".join(clause_select_tanscripts)} 10653 FROM ( 10654 SELECT 10655 "#CHROM", POS, REF, ALT, INFO, 10656 {", ".join(clause_select_variants)} 10657 FROM {table_variants} 10658 ) 10659 WHERE "{transcripts_column}" IS NOT NULL 10660 """ 10661 10662 # Create temporary table 10663 temporary_table = transcripts_table + "".join( 10664 random.choices(string.ascii_uppercase + string.digits, k=10) 10665 ) 10666 10667 # Temporary_tables 10668 temporary_tables.append(temporary_table) 10669 query_view = f""" 10670 CREATE TEMPORARY TABLE 
{temporary_table} 10671 AS ({query}) 10672 """ 10673 self.execute_query(query=query_view) 10674 10675 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
- columns_maps: The `columns_maps` parameter is a list of mapping configurations. Each entry represents a mapping configuration for a specific set of columns and typically includes details such as the main transcript column and additional information columns.
- added_columns: The `added_columns` parameter is a list that stores the additional columns added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column.
- temporary_tables: The `temporary_tables` parameter is a list that stores the names of temporary tables created during the process. These temporary tables hold intermediate results or transformations before the final view is generated.
- annotation_fields: The `annotation_fields` parameter is a list that stores the fields used for annotation in the query view creation process. These fields are extracted from the `transcripts_column` and `transcripts_infos_columns` specified in the columns map.
- column_rename: The `column_rename` parameter is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view, mapping original column names to the desired renamed column names.
- column_clean: The `column_clean` parameter is a boolean flag that determines whether column values should be cleaned by removing non-alphanumeric characters. Defaults to False.
- column_case: The `column_case` parameter specifies the case transformation applied to the columns during the view creation process, allowing column values to be converted to lowercase, uppercase, or left unchanged.
Returns
The `create_transcript_view_from_columns_map` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10677 def create_transcript_view_from_column_format( 10678 self, 10679 transcripts_table: str = "transcripts", 10680 column_formats: dict = {}, 10681 temporary_tables: list = None, 10682 annotation_fields: list = None, 10683 column_rename: dict = {}, 10684 column_clean: bool = False, 10685 column_case: str = None, 10686 ) -> tuple[list, list, list]: 10687 """ 10688 The `create_transcript_view_from_column_format` function generates a transcript view based on 10689 specified column formats, adds additional columns and annotation fields, and returns the list of 10690 temporary tables and annotation fields. 10691 10692 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10693 of the table containing the transcripts data. This table will be used as the base table for 10694 creating the transcript view. The default value for this parameter is "transcripts", but you can 10695 provide a different table name if needed, defaults to transcripts 10696 :type transcripts_table: str (optional) 10697 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10698 about the columns to be used for creating the transcript view. Each entry in the dictionary 10699 specifies the mapping between a transcripts column and a transcripts infos column. This 10700 parameter allows you to define how the columns from the transcripts table should be transformed 10701 or mapped 10702 :type column_formats: dict 10703 :param temporary_tables: The `temporary_tables` parameter in the 10704 `create_transcript_view_from_column_format` function is a list that stores the names of 10705 temporary views created during the process of creating a transcript view from a column format. 
10706 These temporary views are used to manipulate and extract data before generating the final 10707 transcript view 10708 :type temporary_tables: list 10709 :param annotation_fields: The `annotation_fields` parameter in the 10710 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10711 that are extracted from the temporary views created during the process. These annotation fields 10712 are obtained by querying the temporary views and extracting the column names excluding specific 10713 columns like `#CH 10714 :type annotation_fields: list 10715 :param column_rename: The `column_rename` parameter in the 10716 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10717 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10718 column names to new column names in this dictionary, you can rename specific columns during the 10719 process 10720 :type column_rename: dict 10721 :param column_clean: The `column_clean` parameter in the 10722 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10723 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10724 will be cleaned during the creation of the transcript view based on the specified column format, 10725 defaults to False 10726 :type column_clean: bool (optional) 10727 :param column_case: The `column_case` parameter in the 10728 `create_transcript_view_from_column_format` function is used to specify the case transformation 10729 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10730 to convert the column names to uppercase or lowercase, respectively 10731 :type column_case: str 10732 :return: The `create_transcript_view_from_column_format` function returns two lists: 10733 `temporary_tables` and `annotation_fields`. 
10734 """ 10735 10736 log.debug("Start transcrpts view creation from column format...") 10737 10738 # "from_column_format": [ 10739 # { 10740 # "transcripts_column": "ANN", 10741 # "transcripts_infos_column": "Feature_ID", 10742 # } 10743 # ], 10744 10745 # Init 10746 if temporary_tables is None: 10747 temporary_tables = [] 10748 if annotation_fields is None: 10749 annotation_fields = [] 10750 10751 for column_format in column_formats: 10752 10753 # annotation field and transcript annotation field 10754 annotation_field = column_format.get("transcripts_column", "ANN") 10755 transcript_annotation = column_format.get( 10756 "transcripts_infos_column", "Feature_ID" 10757 ) 10758 10759 # Transcripts infos columns rename 10760 column_rename = column_format.get("column_rename", column_rename) 10761 10762 # Transcripts infos columns clean 10763 column_clean = column_format.get("column_clean", column_clean) 10764 10765 # Transcripts infos columns case 10766 column_case = column_format.get("column_case", column_case) 10767 10768 # Temporary View name 10769 temporary_view_name = transcripts_table + "".join( 10770 random.choices(string.ascii_uppercase + string.digits, k=10) 10771 ) 10772 10773 # Create temporary view name 10774 temporary_view_name = self.annotation_format_to_table( 10775 uniquify=True, 10776 annotation_field=annotation_field, 10777 view_name=temporary_view_name, 10778 annotation_id=transcript_annotation, 10779 column_rename=column_rename, 10780 column_clean=column_clean, 10781 column_case=column_case, 10782 ) 10783 10784 # Annotation fields 10785 if temporary_view_name: 10786 query_annotation_fields = f""" 10787 SELECT * 10788 FROM ( 10789 DESCRIBE SELECT * 10790 FROM {temporary_view_name} 10791 ) 10792 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10793 """ 10794 df_annotation_fields = self.get_query_to_df( 10795 query=query_annotation_fields 10796 ) 10797 10798 # Add temporary view and annotation fields 10799 
temporary_tables.append(temporary_view_name) 10800 annotation_fields += list(set(df_annotation_fields["column_name"])) 10801 10802 return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value is "transcripts", but you can provide a different table name if needed.
- column_formats: The `column_formats` parameter is a list of configurations describing the columns to be used for creating the transcript view. Each entry specifies the mapping between a transcripts column and a transcripts infos column, defining how the columns from the transcripts table should be transformed or mapped.
- temporary_tables: The `temporary_tables` parameter is a list that stores the names of temporary views created during the process. These temporary views are used to manipulate and extract data before generating the final transcript view.
- annotation_fields: The `annotation_fields` parameter is a list that stores the annotation fields extracted from the temporary views created during the process. These fields are obtained by querying the temporary views and extracting the column names, excluding specific columns such as `#CHROM`, `POS`, `REF`, and `ALT`.
- column_rename: The `column_rename` parameter is a dictionary that allows custom renaming of columns in the transcripts infos table by providing a mapping of original column names to new column names.
- column_clean: The `column_clean` parameter is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process during the creation of the transcript view. Defaults to False.
- column_case: The `column_case` parameter specifies the case transformation applied to the columns in the transcript view; it can be set to "upper" or "lower" to convert the column names to uppercase or lowercase, respectively.
Returns
The `create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        The `create_transcript_view` function generates a transcript view by processing data from
        the variants table based on provided parameters and structural information.

        The view is built in three stages: (1) per-source temporary tables are created from the
        'from_columns_map' and 'from_column_format' struct entries; (2) those tables are merged
        with UNION BY NAME and aggregated per variant/transcript; (3) the result is materialized
        as `transcripts_table`, optionally remapping transcript identifiers through a mapping
        file and/or stripping transcript versions.

        :param transcripts_table: Name of the table that will store the final transcript view
        data. If not provided, the name is read from param 'transcripts.table' and defaults to
        "transcripts"
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: If True, drop an existing transcripts table before
        creating the new one, defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: Dictionary describing the view to create (struct of transcripts sources,
        transcript version/mapping options). Falls back to `self.get_param()` when empty
        :type param: dict
        :return: The name of the transcripts table created, or None when no
        'transcripts.struct' section is defined in param
        """

        log.debug("Start transcripts view creation...")

        # Default table name when neither argument nor param provides one
        transcripts_table_default = "transcripts"

        # Param (fall back to the object's parameters when none given)
        if not param:
            param = self.get_param()

        # Struct describing the transcripts sources; nothing to do without it
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: strip the '.N' suffix from transcript ids if requested
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping file (two columns: transcript, alias)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Restrict the view to transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table during the process (dropped at the end)
            added_columns = []

            # Temporary tables
            temporary_tables = []

            # Annotation fields
            annotation_fields = []

            # from columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # from column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Remove key/reserved columns from the annotation fields
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query (UNION BY NAME aligns differing columns)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested subqueries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Aggregation clauses for the merge on transcript
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list (distinct transcripts joined with ',')
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields the same way
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Transcript mapping dataframe.
                # NOTE(review): the local variable appears unused, but the SQL below
                # references it by name ('transcript_id_mapping_dataframe') —
                # presumably resolved through DuckDB's replacement scan; confirm
                # before removing
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version removal changes both select and group-by clauses
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group-by key for the final merge: prefer the mapped transcript id,
                # fall back to the original one
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query (first aggregation, keeping original/mapped ids apart)
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge (DESCRIBE the aggregation query)
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause of the second aggregation
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        # Expose the original ids under 'transcripts_mapped'
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping (second aggregation on the mapped transcript id)
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                    {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file (keep only mapped transcripts)
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections.
                # NOTE(review): these two assignments are not referenced in this
                # branch (the query below uses query_transcript_column directly)
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (single aggregation, no mapping)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcripts table if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove columns added to the variants table during the process
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            transcripts_table = None

        return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- transcripts_table: Specifies the name of the table that will store the final transcript view data. If no table name is provided, the function creates a new table; defaults to `transcripts`.
- transcripts_table_drop: Boolean flag that determines whether to drop an existing transcripts table before creating a new one. If set to `True`, the existing transcripts table is dropped if it exists; defaults to `False`.
- param: Dictionary containing the information needed to create the transcript view, such as the structure of the transcripts, column mappings, and column formats. This parameter allows for flexibility and customization.
Returns
The `create_transcript_view` function returns the name of the transcripts table that was created or modified during its execution.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Convert a structured annotation INFO field (e.g. snpEff/VEP 'ANN') into a
        temporary table with one typed column per annotation sub-field.

        The annotation field is exploded into a dedicated column, each value is
        parsed into JSON using the sub-field names declared in the VCF header
        description, and every JSON key becomes a typed column of the created
        temporary table. A 'transcript' column is added from the `annotation_id`
        sub-field.

        :param uniquify: Whether to uniquify annotation values when exploding
            the annotation field, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field containing the annotations to
            explode, defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used to populate the
            'transcript' column of the resulting table, defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            "transcripts"
        :type view_name: str (optional)
        :param column_rename: Optional mapping used to rename annotation
            sub-fields (also applied to `annotation_id`)
        :type column_rename: dict
        :param column_clean: Whether to clean sub-field names with
            `clean_annotation_field`, defaults to False
        :type column_clean: bool (optional)
        :param column_case: Optional case transformation for sub-field names:
            "lower" or "upper"
        :type column_case: str
        :return: The name of the created temporary table, or None if
            `annotation_field` is not declared in the VCF header
        """

        # Name of the intermediate column holding the exploded annotation as JSON
        annotation_format = "annotation_explode"

        # Apply renaming/cleaning to the annotation id so that it matches the
        # transformed column names generated below
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns (normalized to "INFO/" when enabled)
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the exploded annotation field and its JSON form.
        # NOTE(review): the SQL queries below reference `annotation_format`
        # WITHOUT the prefix — confirm behavior when a non-empty prefix is
        # configured (column would be named "INFO/annotation_explode").
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Track columns added to the variants table so they can be dropped at the end
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the quoted part of the header
            # description (expected format: "... 'field1 | field2 | ...'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the internal key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Unique variant id column (temporarily added to the variants table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants with the exploded annotation column into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Parse each annotation value into JSON keyed by the header sub-fields
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct JSON keys present in the parsed annotations
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Raw key and its transformed (renamed/cleaned/cased) form
                key = row.iloc[0]
                key_clean = key

                # Key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract all values of this key in order to infer the column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings, then replace empty
                # strings or None with NaN and drop those rows, so that type
                # detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining values
                column_type = detect_column_type(df_json_type[key_clean])

                # SELECT expression: extract value, empty-to-NULL, cast, alias
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table with one row per annotation entry
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing to convert
            view_name = None

        # Remove the columns temporarily added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The annotation_format_to_table function converts annotation data from a VCF file into a
structured table format, ensuring unique values and creating a temporary table for further
processing or analysis.
Parameters
- uniquify: Boolean flag that determines whether to ensure unique values in the output; defaults to `True`.
- annotation_field: The field in the VCF file that contains the annotation information for each variant; defaults to `ANN`.
- annotation_id: Identifier for the annotation feature, used as a column name in the resulting table or view; defaults to `Feature_ID`.
- view_name: Name of the temporary table created to store the transformed annotation data; defaults to `transcripts`.
- column_rename: Dictionary of key-value pairs used to rename specific columns in the resulting table or view.
- column_clean: Boolean flag that determines whether annotation field names are cleaned (e.g. removing unwanted characters) before further processing; defaults to `False`.
- column_case: Case transformation applied to the column names extracted from the annotation data, either `lower` or `upper`.
Returns
The function `annotation_format_to_table` returns the name of the view created, stored in the variable `view_name`.
11288 def transcript_view_to_variants( 11289 self, 11290 transcripts_table: str = None, 11291 transcripts_column_id: str = None, 11292 transcripts_info_json: str = None, 11293 transcripts_info_field_json: str = None, 11294 transcripts_info_format: str = None, 11295 transcripts_info_field_format: str = None, 11296 param: dict = {}, 11297 ) -> bool: 11298 """ 11299 The `transcript_view_to_variants` function updates a variants table with information from 11300 transcripts in JSON format. 11301 11302 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11303 table containing the transcripts data. If this parameter is not provided, the function will 11304 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11305 :type transcripts_table: str 11306 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11307 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11308 identifier is used to match transcripts with variants in the database 11309 :type transcripts_column_id: str 11310 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11311 of the column in the variants table where the transcripts information will be stored in JSON 11312 format. This parameter allows you to define the column in the variants table that will hold the 11313 JSON-formatted information about transcripts 11314 :type transcripts_info_json: str 11315 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11316 specify the field in the VCF header that will contain information about transcripts in JSON 11317 format. 
This field will be added to the VCF header as an INFO field with the specified name 11318 :type transcripts_info_field_json: str 11319 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11320 format of the information about transcripts that will be stored in the variants table. This 11321 format can be used to define how the transcript information will be structured or displayed 11322 within the variants table 11323 :type transcripts_info_format: str 11324 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11325 specify the field in the VCF header that will contain information about transcripts in a 11326 specific format. This field will be added to the VCF header as an INFO field with the specified 11327 name 11328 :type transcripts_info_field_format: str 11329 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11330 that contains various configuration settings related to transcripts. It is used to provide 11331 default values for certain parameters if they are not explicitly provided when calling the 11332 method. The `param` dictionary can be passed as an argument 11333 :type param: dict 11334 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11335 if the operation is successful and `False` if certain conditions are not met. 
11336 """ 11337 11338 msg_info_prefix = "Start transcripts view to variants annotations" 11339 11340 log.debug(f"{msg_info_prefix}...") 11341 11342 # Default 11343 transcripts_table_default = "transcripts" 11344 transcripts_column_id_default = "transcript" 11345 transcripts_info_json_default = None 11346 transcripts_info_format_default = None 11347 transcripts_info_field_json_default = None 11348 transcripts_info_field_format_default = None 11349 11350 # Param 11351 if not param: 11352 param = self.get_param() 11353 11354 # Transcripts table 11355 if transcripts_table is None: 11356 transcripts_table = param.get("transcripts", {}).get( 11357 "table", transcripts_table_default 11358 ) 11359 11360 # Transcripts column ID 11361 if transcripts_column_id is None: 11362 transcripts_column_id = param.get("transcripts", {}).get( 11363 "column_id", transcripts_column_id_default 11364 ) 11365 11366 # Transcripts info json 11367 if transcripts_info_json is None: 11368 transcripts_info_json = param.get("transcripts", {}).get( 11369 "transcripts_info_json", transcripts_info_json_default 11370 ) 11371 11372 # Transcripts info field JSON 11373 if transcripts_info_field_json is None: 11374 transcripts_info_field_json = param.get("transcripts", {}).get( 11375 "transcripts_info_field_json", transcripts_info_field_json_default 11376 ) 11377 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11378 # transcripts_info_json = transcripts_info_field_json 11379 11380 # Transcripts info format 11381 if transcripts_info_format is None: 11382 transcripts_info_format = param.get("transcripts", {}).get( 11383 "transcripts_info_format", transcripts_info_format_default 11384 ) 11385 11386 # Transcripts info field FORMAT 11387 if transcripts_info_field_format is None: 11388 transcripts_info_field_format = param.get("transcripts", {}).get( 11389 "transcripts_info_field_format", transcripts_info_field_format_default 11390 ) 11391 # if ( 11392 # 
transcripts_info_field_format is not None 11393 # and transcripts_info_format is None 11394 # ): 11395 # transcripts_info_format = transcripts_info_field_format 11396 11397 # Variants table 11398 table_variants = self.get_table_variants() 11399 11400 # Check info columns param 11401 if ( 11402 transcripts_info_json is None 11403 and transcripts_info_field_json is None 11404 and transcripts_info_format is None 11405 and transcripts_info_field_format is None 11406 ): 11407 return False 11408 11409 # Transcripts infos columns 11410 query_transcripts_infos_columns = f""" 11411 SELECT * 11412 FROM ( 11413 DESCRIBE SELECT * FROM {transcripts_table} 11414 ) 11415 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11416 """ 11417 transcripts_infos_columns = list( 11418 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11419 ) 11420 11421 # View results 11422 clause_select = [] 11423 clause_to_json = [] 11424 clause_to_format = [] 11425 for field in transcripts_infos_columns: 11426 # Do not consider INFO field for export into fields 11427 if field not in ["INFO"]: 11428 clause_select.append( 11429 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11430 ) 11431 clause_to_json.append(f""" '{field}': "{field}" """) 11432 clause_to_format.append(f""" "{field}" """) 11433 11434 # Update 11435 update_set_json = [] 11436 update_set_format = [] 11437 11438 # VCF header 11439 vcf_reader = self.get_header() 11440 11441 # Transcripts to info column in JSON 11442 if transcripts_info_json: 11443 11444 # Create column on variants table 11445 self.add_column( 11446 table_name=table_variants, 11447 column_name=transcripts_info_json, 11448 column_type="JSON", 11449 default_value=None, 11450 drop=False, 11451 ) 11452 11453 # Add header 11454 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11455 transcripts_info_json, 11456 ".", 11457 "String", 11458 "Transcripts in JSON format", 11459 "unknwon", 
11460 "unknwon", 11461 self.code_type_map["String"], 11462 ) 11463 11464 # Add to update 11465 update_set_json.append( 11466 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11467 ) 11468 11469 # Transcripts to info field in JSON 11470 if transcripts_info_field_json: 11471 11472 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11473 11474 # Add to update 11475 update_set_json.append( 11476 f""" 11477 INFO = concat( 11478 CASE 11479 WHEN INFO NOT IN ('', '.') 11480 THEN INFO 11481 ELSE '' 11482 END, 11483 CASE 11484 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11485 THEN concat( 11486 ';{transcripts_info_field_json}=', 11487 t.{transcripts_info_json} 11488 ) 11489 ELSE '' 11490 END 11491 ) 11492 """ 11493 ) 11494 11495 # Add header 11496 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11497 transcripts_info_field_json, 11498 ".", 11499 "String", 11500 "Transcripts in JSON format", 11501 "unknwon", 11502 "unknwon", 11503 self.code_type_map["String"], 11504 ) 11505 11506 if update_set_json: 11507 11508 # Update query 11509 query_update = f""" 11510 UPDATE {table_variants} 11511 SET {", ".join(update_set_json)} 11512 FROM 11513 ( 11514 SELECT 11515 "#CHROM", POS, REF, ALT, 11516 concat( 11517 '{{', 11518 string_agg( 11519 '"' || "{transcripts_column_id}" || '":' || 11520 to_json(json_output) 11521 ), 11522 '}}' 11523 )::JSON AS {transcripts_info_json} 11524 FROM 11525 ( 11526 SELECT 11527 "#CHROM", POS, REF, ALT, 11528 "{transcripts_column_id}", 11529 to_json( 11530 {{{",".join(clause_to_json)}}} 11531 )::JSON AS json_output 11532 FROM 11533 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11534 WHERE "{transcripts_column_id}" IS NOT NULL 11535 ) 11536 GROUP BY "#CHROM", POS, REF, ALT 11537 ) AS t 11538 WHERE {table_variants}."#CHROM" = t."#CHROM" 11539 AND {table_variants}."POS" = t."POS" 11540 AND {table_variants}."REF" = t."REF" 11541 AND 
{table_variants}."ALT" = t."ALT" 11542 """ 11543 11544 self.execute_query(query=query_update) 11545 11546 # Transcripts to info column in FORMAT 11547 if transcripts_info_format: 11548 11549 # Create column on variants table 11550 self.add_column( 11551 table_name=table_variants, 11552 column_name=transcripts_info_format, 11553 column_type="VARCHAR", 11554 default_value=None, 11555 drop=False, 11556 ) 11557 11558 # Add header 11559 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11560 transcripts_info_format, 11561 ".", 11562 "String", 11563 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11564 "unknwon", 11565 "unknwon", 11566 self.code_type_map["String"], 11567 ) 11568 11569 # Add to update 11570 update_set_format.append( 11571 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11572 ) 11573 11574 else: 11575 11576 # Set variable for internal queries 11577 transcripts_info_format = "transcripts_info_format" 11578 11579 # Transcripts to info field in JSON 11580 if transcripts_info_field_format: 11581 11582 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11583 11584 # Add to update 11585 update_set_format.append( 11586 f""" 11587 INFO = concat( 11588 CASE 11589 WHEN INFO NOT IN ('', '.') 11590 THEN INFO 11591 ELSE '' 11592 END, 11593 CASE 11594 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11595 THEN concat( 11596 ';{transcripts_info_field_format}=', 11597 t.{transcripts_info_format} 11598 ) 11599 ELSE '' 11600 END 11601 ) 11602 """ 11603 ) 11604 11605 # Add header 11606 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11607 transcripts_info_field_format, 11608 ".", 11609 "String", 11610 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11611 "unknwon", 11612 "unknwon", 11613 self.code_type_map["String"], 11614 ) 11615 11616 if update_set_format: 11617 11618 # Update query 11619 query_update = f""" 11620 UPDATE 
{table_variants} 11621 SET {", ".join(update_set_format)} 11622 FROM 11623 ( 11624 SELECT 11625 "#CHROM", POS, REF, ALT, 11626 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11627 FROM 11628 ( 11629 SELECT 11630 "#CHROM", POS, REF, ALT, 11631 "{transcripts_column_id}", 11632 concat( 11633 "{transcripts_column_id}", 11634 '|', 11635 {", '|', ".join(clause_to_format)} 11636 ) AS {transcripts_info_format} 11637 FROM 11638 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11639 ) 11640 GROUP BY "#CHROM", POS, REF, ALT 11641 ) AS t 11642 WHERE {table_variants}."#CHROM" = t."#CHROM" 11643 AND {table_variants}."POS" = t."POS" 11644 AND {table_variants}."REF" = t."REF" 11645 AND {table_variants}."ALT" = t."ALT" 11646 """ 11647 11648 self.execute_query(query=query_update) 11649 11650 return True
The transcript_view_to_variants function updates a variants table with information from
transcripts in JSON format.
Parameters
- transcripts_table: Name of the table containing the transcripts data. If not provided, it is retrieved from the `param` dictionary or defaults to "transcripts".
- transcripts_column_id: Column in the `transcripts_table` that contains the unique identifier for each transcript, used to match transcripts with variants.
- transcripts_info_json: Name of the column in the variants table where the transcripts information will be stored in JSON format.
- transcripts_info_field_json: Name of the INFO field, added to the VCF header, that will contain transcript information in JSON format.
- transcripts_info_format: Name of the column in the variants table where the transcripts information will be stored in a structured format.
- transcripts_info_field_format: Name of the INFO field, added to the VCF header, that will contain transcript information in a structured format.
- param: Dictionary of configuration settings related to transcripts, used to provide default values for parameters that are not explicitly supplied.
Returns
The function `transcript_view_to_variants` returns a boolean value: `True` if the operation succeeds, `False` if no output column or field is configured.
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        Rename or remove INFO fields, both in the VCF header and in the INFO
        column of the variants table.

        Renames/removals are applied to the INFO column through chains of
        nested `regexp_replace` calls, split into partitions of at most
        `regex_replace_partition` replacements so that each UPDATE query stays
        a manageable size; one UPDATE is executed per partition.

        :param fields_to_rename: Mapping of original INFO field names (keys)
            to their new names (values); a value of None removes the field
        :type fields_to_rename: dict
        :param table: Name of the table whose INFO column is updated; defaults
            to the variants table
        :type table: str
        :return: Dictionary mapping each processed original field name to its
            new name (or None if the field was removed)
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        # Default to the variants table
        if table is None:
            table = self.get_table_variants()

        # regexp_replace chain state: one nested-call chain per partition key
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "INFO"

        # Skip entirely in read-only mode or when nothing is requested
        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header: copy the existing header entry under the
                    # new name (removal only deletes the old entry)
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # INFO pattern: the field either starts the INFO string or
                    # follows a ';'; its value runs up to the next ';'
                    field_pattern = rf'(^|;)({field_to_rename})=([^;]*)'
                    if field_renamed is not None:
                        field_renamed_pattern = rf'\1{field_renamed}=\3'
                    else:
                        # NOTE(review): removal also drops the captured
                        # separator, so removing the FIRST INFO field leaves a
                        # leading ';' in INFO — confirm downstream tolerance
                        field_renamed_pattern = ''

                    # Extend the current partition's regexp_replace chain,
                    # starting a fresh chain from 'INFO' every
                    # `regex_replace_partition` replacements
                    # NOTE(review): increment-before-keying makes the first
                    # partition hold 124 entries, later ones 125 — harmless
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "INFO"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Record the result for the caller
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
                    else:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")

            # Apply each partition's replacement chain with one UPDATE query
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = {regex_replace}
                    """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
The rename_info_fields function renames specified fields in a VCF file header and updates
corresponding INFO fields in the variants table.
Parameters
- fields_to_rename: Dictionary mapping the original INFO field names (keys) to their new names (values) in a VCF (Variant Call Format) file; a value of `None` removes the field.
- table: Name of the table in which the variants data is stored; its INFO fields are updated when fields are renamed or removed from the VCF header.
Returns
The `rename_info_fields` function returns a dictionary `fields_renamed` that contains the original field names as keys and their corresponding new names (or None if the field was removed) as values after renaming or removing specified fields in a VCF file header and updating corresponding INFO fields in the variants table.
def calculation_rename_info_fields(
    self,
    fields_to_rename: dict = None,
    table: str = None,
    operation_name: str = "RENAME_INFO_FIELDS",
) -> None:
    """
    Resolve the fields-to-rename mapping and target table — from the
    arguments if given, otherwise from the calculation parameters — and
    delegate the actual renaming to `rename_info_fields`.

    :param fields_to_rename: Dictionary mapping original field names (keys)
        to their new names (values). When None, the mapping is read from
        the param dictionary under
        calculation.calculations.<operation_name>.fields_to_rename
    :type fields_to_rename: dict
    :param table: Name of the table whose fields are to be renamed. When
        None, it is read from the param dictionary under
        calculation.calculations.<operation_name>.table
    :type table: str
    :param operation_name: Name of the calculation operation whose
        configuration is looked up in the param dictionary, defaults to
        RENAME_INFO_FIELDS
    :type operation_name: str (optional)
    """

    # Operation-specific configuration block from the param dictionary
    # (empty dict when any level of the nesting is absent)
    operation_config = (
        self.get_param()
        .get("calculation", {})
        .get("calculations", {})
        .get(operation_name, {})
    )

    # Arguments take precedence; fall back to configured values
    if fields_to_rename is None:
        fields_to_rename = operation_config.get("fields_to_rename", None)
    if table is None:
        table = operation_config.get("table", None)

    # Delegate the renaming and keep a trace of what was renamed
    renamed_fields = self.rename_info_fields(
        fields_to_rename=fields_to_rename, table=table
    )

    log.debug(f"renamed_fields:{renamed_fields}")
The calculation_rename_info_fields function retrieves parameters from a dictionary, updates
fields to rename and table if provided, and then calls another function to rename the fields.
Parameters
- fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be renamed in a table. Each key-value pair in the dictionary represents the original field name as the key and the new field name as the value. - table: The
`table` parameter in the `calculation_rename_info_fields` method is used to specify the name of the table for which the fields are to be renamed. It is a string type parameter. - operation_name: The
`operation_name` parameter in the `calculation_rename_info_fields` method is a string that specifies the name of the operation being performed. In this context, it is used as a default value for the operation name if not explicitly provided when calling the function, defaults to RENAME_INFO_FIELDS.